From c0164bc5605af5598f459542fd7a74de447629ec Mon Sep 17 00:00:00 2001 From: liuqi Date: Mon, 29 Oct 2018 18:06:09 +0800 Subject: [PATCH] Refactor: Adjusted the structure of MACE to support the mixing of CPU and GPU. 1. Merge the Operation and OpKernel into Operation. 2. Add OpDefRegistry for registering the definitions of Operations. 3. Add information for MaceStatus. 4. Unify 'BatchNorm' and 'FoldedBatchNorm' into 'BatchNorm'. 5. Remove unused op 'Proposal'. --- docs/development/adding_a_new_op.md | 144 ++- mace/benchmark/benchmark_model.cc | 2 +- mace/codegen/BUILD | 1 + mace/core/future.h | 7 +- mace/core/net.cc | 171 +-- mace/core/net.h | 47 +- .../{op_kernel_context.cc => op_context.cc} | 24 +- .../{op_kernel_context.h => op_context.h} | 20 +- mace/core/op_def_registry.cc | 77 ++ mace/core/op_def_registry.h | 81 ++ mace/core/operator.cc | 90 +- mace/core/operator.h | 163 +-- mace/core/runtime/cpu/cpu_runtime.cc | 29 +- mace/core/runtime/cpu/cpu_runtime.h | 8 + mace/core/runtime/opencl/gpu_device.cc | 2 +- .../src/main/cpp/image_classify.cc | 12 +- mace/examples/cli/example.cc | 2 +- mace/kernels/BUILD | 17 +- mace/kernels/activation.cc | 118 ++ mace/kernels/activation.h | 73 +- mace/kernels/addn.cc | 146 +++ mace/kernels/addn.h | 118 -- mace/kernels/{argmax.h => argmax.cc} | 39 +- mace/kernels/batch_norm.cc | 209 +++ mace/kernels/batch_norm.h | 159 --- .../{batch_to_space.h => batch_to_space.cc} | 140 +- mace/kernels/{bias_add.h => bias_add.cc} | 112 +- mace/kernels/buffer_inverse_transform.cc | 67 + mace/kernels/buffer_inverse_transform.h | 82 -- mace/kernels/buffer_transform.cc | 67 + mace/kernels/buffer_transform.h | 81 -- mace/{ops/cast.h => kernels/cast.cc} | 30 +- .../{channel_shuffle.h => channel_shuffle.cc} | 87 +- mace/kernels/{concat.h => concat.cc} | 161 ++- mace/kernels/{conv_2d.h => conv_2d.cc} | 829 ++++++------ mace/{ops => kernels}/conv_pool_2d_base.h | 25 +- mace/kernels/conv_pool_2d_util.cc | 6 +- mace/kernels/{crop.h => crop.cc} | 156 +-- 
mace/kernels/deconv_2d.cc | 561 ++++++++ mace/kernels/deconv_2d.h | 493 -------- .../{depth_to_space.h => depth_to_space.cc} | 83 +- ...depthwise_conv2d.h => depthwise_conv2d.cc} | 464 ++++--- mace/kernels/eltwise.cc | 1125 +++++++++++++++++ mace/kernels/eltwise.h | 1078 +--------------- .../kernels/{expand_dims.h => expand_dims.cc} | 49 +- mace/kernels/{fill.h => fill.cc} | 43 +- mace/kernels/fully_connected.cc | 233 ++++ mace/kernels/fully_connected.h | 186 --- mace/kernels/{gather.h => gather.cc} | 59 +- mace/kernels/gemmlowp_util.h | 3 +- mace/kernels/identity.cc | 50 + .../infer_conv2d_shape.cc} | 56 +- ...response_norm.h => local_response_norm.cc} | 75 +- mace/{ops/lstmcell.h => kernels/lstm_cell.cc} | 59 +- mace/kernels/lstmcell.h | 70 - mace/kernels/{matmul.h => matmul.cc} | 255 ++-- mace/kernels/opencl/activation.cc | 46 - mace/kernels/opencl/activation.h | 40 + mace/kernels/opencl/addn.cc | 45 - mace/kernels/opencl/addn.h | 42 + mace/kernels/opencl/batch_norm.cc | 53 - mace/kernels/opencl/batch_norm.h | 43 + mace/kernels/opencl/batch_to_space.cc | 51 - .../{reshape.h => opencl/batch_to_space.h} | 39 +- mace/kernels/opencl/bias_add.cc | 46 - mace/kernels/opencl/bias_add.h | 40 + .../opencl/buffer/buffer_inverse_transform.h | 24 +- .../kernels/opencl/buffer/buffer_transform.cc | 29 +- mace/kernels/opencl/buffer/buffer_transform.h | 48 +- .../opencl/buffer/buffer_type_transform.cc | 16 +- mace/kernels/opencl/buffer/conv_2d.h | 18 +- mace/kernels/opencl/buffer/conv_2d_1x1.cc | 7 +- mace/kernels/opencl/buffer/conv_2d_general.cc | 7 +- .../kernels/opencl/buffer/depthwise_conv2d.cc | 4 +- mace/kernels/opencl/buffer/depthwise_conv2d.h | 16 +- mace/kernels/opencl/buffer/pooling.h | 16 +- mace/kernels/opencl/buffer/softmax.h | 18 +- mace/kernels/opencl/buffer/utils.cc | 4 +- mace/kernels/opencl/buffer/utils.h | 4 +- .../opencl/buffer_inverse_transform.cc | 49 - .../kernels/opencl/buffer_inverse_transform.h | 41 + mace/kernels/opencl/buffer_transform.cc | 48 - 
mace/kernels/opencl/buffer_transform.h | 41 + mace/kernels/opencl/channel_shuffle.cc | 41 - mace/kernels/opencl/channel_shuffle.h | 39 + mace/kernels/opencl/concat.cc | 45 - mace/kernels/opencl/concat.h | 41 + mace/kernels/opencl/conv_2d.cc | 62 - mace/kernels/opencl/conv_2d.h | 47 + mace/kernels/opencl/crop.cc | 45 - mace/kernels/opencl/crop.h | 41 + mace/kernels/opencl/deconv_2d.cc | 94 -- mace/kernels/opencl/deconv_2d.h | 46 + mace/kernels/opencl/depth_to_space.cc | 42 - mace/kernels/opencl/depth_to_space.h | 39 + mace/kernels/opencl/depthwise_conv2d.cc | 61 - mace/kernels/opencl/depthwise_conv2d.h | 48 + mace/kernels/opencl/eltwise.cc | 48 - mace/kernels/opencl/eltwise.h | 40 + mace/kernels/opencl/fully_connected.cc | 50 - mace/kernels/opencl/fully_connected.h | 45 + mace/kernels/opencl/image/activation.h | 19 +- mace/kernels/opencl/image/addn.h | 18 +- mace/kernels/opencl/image/batch_norm.h | 39 +- mace/kernels/opencl/image/batch_to_space.h | 18 +- mace/kernels/opencl/image/bias_add.h | 20 +- mace/kernels/opencl/image/buffer_to_image.h | 21 +- mace/kernels/opencl/image/channel_shuffle.h | 21 +- mace/kernels/opencl/image/concat.cc | 18 +- mace/kernels/opencl/image/concat.h | 27 +- mace/kernels/opencl/image/conv_2d.h | 33 +- mace/kernels/opencl/image/conv_2d_1x1.cc | 11 +- mace/kernels/opencl/image/conv_2d_3x3.cc | 10 +- mace/kernels/opencl/image/conv_2d_general.cc | 12 +- mace/kernels/opencl/image/crop.h | 18 +- mace/kernels/opencl/image/deconv_2d.h | 18 +- mace/kernels/opencl/image/depth_to_space.h | 18 +- mace/kernels/opencl/image/depthwise_conv2d.cc | 7 +- mace/kernels/opencl/image/depthwise_conv2d.h | 19 +- mace/kernels/opencl/image/eltwise.h | 19 +- mace/kernels/opencl/image/fully_connected.h | 20 +- mace/kernels/opencl/image/image_to_buffer.h | 19 +- mace/kernels/opencl/image/lstm_cell.h | 19 +- mace/kernels/opencl/image/matmul.h | 19 +- mace/kernels/opencl/image/pad.h | 18 +- mace/kernels/opencl/image/pooling.h | 18 +- 
mace/kernels/opencl/image/reduce_mean.h | 20 +- mace/kernels/opencl/image/resize_bicubic.h | 29 +- mace/kernels/opencl/image/resize_bilinear.h | 27 +- mace/kernels/opencl/image/softmax.h | 18 +- mace/kernels/opencl/image/space_to_batch.h | 18 +- mace/kernels/opencl/image/space_to_depth.h | 18 +- mace/kernels/opencl/image/split.h | 22 +- mace/kernels/opencl/image/sqrdiff_mean.h | 22 +- .../kernels/opencl/image/winograd_transform.h | 42 +- mace/kernels/opencl/lstm_cell.h | 44 + mace/kernels/opencl/lstmcell.cc | 51 - mace/kernels/opencl/matmul.cc | 46 - mace/kernels/opencl/matmul.h | 42 + .../kernels/opencl/out_of_range_check_test.cc | 36 +- mace/kernels/opencl/pad.cc | 45 - mace/kernels/opencl/pad.h | 38 + mace/kernels/opencl/pooling.cc | 60 - mace/kernels/opencl/pooling.h | 46 + mace/kernels/opencl/reduce_mean.cc | 44 - mace/kernels/opencl/reduce_mean.h | 39 + mace/kernels/opencl/resize_bicubic.cc | 47 - mace/kernels/opencl/resize_bicubic.h | 39 + mace/kernels/opencl/resize_bilinear.cc | 45 - mace/kernels/opencl/resize_bilinear.h | 39 + mace/kernels/opencl/softmax.cc | 43 - mace/kernels/opencl/softmax.h | 39 + mace/kernels/opencl/space_to_batch.cc | 52 - .../opencl/space_to_batch.h} | 42 +- mace/kernels/opencl/space_to_depth.cc | 43 - mace/kernels/opencl/space_to_depth.h | 39 + mace/kernels/opencl/split.cc | 44 - mace/kernels/opencl/split.h | 41 + mace/kernels/opencl/sqrdiff_mean.cc | 43 - mace/kernels/opencl/sqrdiff_mean.h | 39 + mace/kernels/opencl/winograd_transform.cc | 68 - mace/kernels/opencl/winograd_transform.h | 50 + mace/kernels/ops_register.cc | 132 ++ mace/{ops => kernels}/ops_register.h | 12 +- mace/kernels/pad.cc | 130 ++ mace/kernels/pad.h | 114 -- mace/kernels/pooling.cc | 467 +++++++ mace/kernels/pooling.h | 464 ------- mace/kernels/proposal.h | 301 ----- mace/kernels/{quantize.h => quantize.cc} | 73 +- .../kernels/{reduce_mean.h => reduce_mean.cc} | 126 +- mace/{ops/reshape.h => kernels/reshape.cc} | 41 +- mace/kernels/resize_bicubic.cc | 234 ++++ 
mace/kernels/resize_bicubic.h | 197 +-- mace/kernels/resize_bilinear.cc | 371 ++++++ mace/kernels/resize_bilinear.h | 316 +---- mace/kernels/{reverse.h => reverse.cc} | 50 +- .../kernels/{scalar_math.h => scalar_math.cc} | 58 +- mace/{ops/shape.h => kernels/shape.cc} | 36 +- mace/kernels/{softmax.h => softmax.cc} | 116 +- .../{space_to_batch.h => space_to_batch.cc} | 138 +- .../{space_to_depth.h => space_to_depth.cc} | 83 +- mace/kernels/{split.h => split.cc} | 97 +- .../{sqrdiff_mean.h => sqrdiff_mean.cc} | 115 +- mace/{ops/squeeze.h => kernels/squeeze.cc} | 44 +- mace/kernels/{stack.h => stack.cc} | 47 +- .../{strided_slice.h => strided_slice.cc} | 94 +- mace/kernels/{transpose.h => transpose.cc} | 48 +- mace/kernels/{unstack.h => unstack.cc} | 39 +- mace/kernels/winograd_transform.cc | 102 ++ mace/kernels/winograd_transform.h | 89 -- mace/libmace/BUILD | 11 +- mace/libmace/mace.cc | 37 +- mace/ops/BUILD | 32 +- mace/ops/activation.cc | 43 - mace/ops/activation.h | 55 - mace/ops/activation_benchmark.cc | 2 +- mace/ops/activation_test.cc | 2 +- mace/ops/addn.cc | 43 - mace/ops/addn.h | 55 - mace/ops/addn_benchmark.cc | 2 +- mace/ops/addn_test.cc | 2 +- mace/ops/argmax.cc | 29 - mace/ops/argmax.h | 49 - mace/ops/argmax_test.cc | 2 +- mace/ops/batch_norm.cc | 43 - mace/ops/batch_norm.h | 71 -- mace/ops/batch_norm_benchmark.cc | 2 +- mace/ops/batch_norm_test.cc | 2 +- mace/ops/batch_to_space.cc | 46 - mace/ops/batch_to_space.h | 53 - mace/ops/batch_to_space_benchmark.cc | 2 +- mace/ops/bias_add.cc | 43 - mace/ops/bias_add.h | 57 - mace/ops/bias_add_benchmark.cc | 2 +- mace/ops/bias_add_test.cc | 2 +- mace/ops/buffer_inverse_transform.cc | 35 - mace/ops/buffer_inverse_transform.h | 53 - mace/ops/buffer_to_image_benchmark.cc | 2 +- mace/ops/buffer_transform.cc | 35 - mace/ops/buffer_transform.h | 53 - mace/ops/cast.cc | 34 - mace/ops/cast_test.cc | 2 +- mace/ops/channel_shuffle.cc | 43 - mace/ops/channel_shuffle.h | 63 - mace/ops/channel_shuffle_benchmark.cc | 2 +- 
mace/ops/channel_shuffle_test.cc | 2 +- mace/ops/concat.cc | 52 - mace/ops/concat.h | 60 - mace/ops/concat_benchmark.cc | 2 +- mace/ops/concat_test.cc | 1 - mace/ops/conv_2d.cc | 49 - mace/ops/conv_2d.h | 62 - mace/ops/conv_2d_benchmark.cc | 4 +- mace/ops/conv_2d_test.cc | 3 +- mace/ops/core_test.cc | 23 +- mace/ops/crop.cc | 42 - mace/ops/crop.h | 50 - mace/ops/crop_benchmark.cc | 2 +- mace/ops/crop_test.cc | 2 +- mace/ops/deconv_2d.cc | 43 - mace/ops/deconv_2d.h | 76 -- mace/ops/deconv_2d_benchmark.cc | 4 +- mace/ops/deconv_2d_test.cc | 3 +- mace/ops/depth_to_space.cc | 43 - mace/ops/depth_to_space.h | 55 - mace/ops/depth_to_space_benchmark.cc | 2 +- mace/ops/depth_to_space_test.cc | 2 +- mace/ops/depthwise_conv2d.cc | 49 - mace/ops/depthwise_conv2d.h | 66 - mace/ops/depthwise_conv2d_benchmark.cc | 4 +- mace/ops/depthwise_conv2d_test.cc | 2 +- mace/ops/eltwise.cc | 53 - mace/ops/eltwise.h | 56 - mace/ops/eltwise_benchmark.cc | 2 +- mace/ops/eltwise_test.cc | 2 +- mace/ops/expand_dims.cc | 39 - mace/ops/expand_dims.h | 50 - mace/ops/expand_dims_test.cc | 2 +- mace/ops/fill.cc | 29 - mace/ops/fill.h | 50 - mace/ops/fill_test.cc | 2 +- mace/ops/folded_batch_norm.cc | 43 - mace/ops/folded_batch_norm.h | 67 - mace/ops/folded_batch_norm_test.cc | 22 +- mace/ops/fully_connected.cc | 48 - mace/ops/fully_connected.h | 79 -- mace/ops/fully_connected_benchmark.cc | 2 +- mace/ops/fully_connected_test.cc | 3 +- mace/ops/gather.cc | 29 - mace/ops/gather.h | 52 - mace/ops/gather_benchmark.cc | 3 +- mace/ops/gather_test.cc | 2 +- mace/ops/identity.cc | 48 - mace/ops/identity_test.cc | 2 +- mace/ops/infer_conv2d_shape.cc | 46 - mace/ops/infer_conv2d_shape_test.cc | 2 +- mace/ops/local_response_norm.cc | 29 - mace/ops/local_response_norm.h | 62 - mace/ops/local_response_norm_benchmark.cc | 2 +- mace/ops/local_response_norm_test.cc | 2 +- mace/ops/lstmcell.cc | 35 - mace/ops/lstmcell_benchmark.cc | 2 +- mace/ops/lstmcell_test.cc | 2 +- mace/ops/lstmcell_test_util.h | 2 +- 
mace/ops/matmul.cc | 49 - mace/ops/matmul.h | 70 - mace/ops/matmul_benchmark.cc | 2 +- mace/ops/matmul_test.cc | 2 +- mace/ops/ops_def_register.cc | 373 ++++++ .../kernel.h => ops/ops_def_register.h} | 19 +- mace/ops/ops_register.cc | 136 -- mace/ops/ops_test_util.h | 53 +- mace/ops/pad.cc | 42 - mace/ops/pad.h | 48 - mace/ops/pad_benchmark.cc | 2 +- mace/ops/pad_test.cc | 2 +- mace/ops/pooling.cc | 48 - mace/ops/pooling.h | 63 - mace/ops/pooling_benchmark.cc | 4 +- mace/ops/pooling_test.cc | 5 +- mace/ops/proposal.cc | 29 - mace/ops/proposal.h | 60 - mace/ops/proposal_test.cc | 71 -- mace/ops/quantize.cc | 37 - mace/ops/quantize.h | 76 -- mace/ops/quantize_test.cc | 2 +- mace/ops/reduce_mean.cc | 42 - mace/ops/reduce_mean.h | 63 - mace/ops/reduce_mean_benchmark.cc | 2 +- mace/ops/reduce_mean_test.cc | 2 +- mace/ops/reshape.cc | 49 - mace/ops/reshape_test.cc | 2 +- mace/ops/resize_bicubic.cc | 43 - mace/ops/resize_bicubic.h | 51 - mace/ops/resize_bicubic_benchmark.cc | 2 +- mace/ops/resize_bicubic_test.cc | 3 +- mace/ops/resize_bilinear.cc | 49 - mace/ops/resize_bilinear.h | 50 - mace/ops/resize_bilinear_benchmark.cc | 2 +- mace/ops/resize_bilinear_test.cc | 3 +- mace/ops/reverse.cc | 29 - mace/ops/reverse.h | 50 - mace/ops/reverse_benchmark.cc | 2 +- mace/ops/reverse_test.cc | 2 +- mace/ops/scalar_math.cc | 44 - mace/ops/scalar_math.h | 53 - mace/ops/scalar_math_test.cc | 2 +- mace/ops/shape.cc | 42 - mace/ops/shape_test.cc | 2 +- mace/ops/softmax.cc | 48 - mace/ops/softmax.h | 51 - mace/ops/softmax_benchmark.cc | 2 +- mace/ops/softmax_test.cc | 2 +- mace/ops/space_to_batch.cc | 47 - mace/ops/space_to_batch.h | 53 - mace/ops/space_to_batch_benchmark.cc | 2 +- mace/ops/space_to_depth.cc | 43 - mace/ops/space_to_depth.h | 53 - mace/ops/space_to_depth_benchmark.cc | 2 +- mace/ops/space_to_depth_test.cc | 2 +- mace/ops/split.cc | 43 - mace/ops/split.h | 55 - mace/ops/split_benchmark.cc | 2 +- mace/ops/split_test.cc | 1 - mace/ops/sqrdiff_mean.cc | 42 - 
mace/ops/sqrdiff_mean.h | 53 - mace/ops/squeeze.cc | 48 - mace/ops/squeeze_test.cc | 2 +- mace/ops/stack.cc | 44 - mace/ops/stack.h | 49 - mace/ops/stack_test.cc | 2 +- mace/ops/strided_slice.cc | 44 - mace/ops/strided_slice.h | 62 - mace/ops/strided_slice_test.cc | 2 +- mace/ops/transpose.cc | 29 - mace/ops/transpose.h | 59 - mace/ops/transpose_benchmark.cc | 2 +- mace/ops/transpose_test.cc | 2 +- mace/ops/unstack.cc | 34 - mace/ops/unstack.h | 49 - mace/ops/unstack_test.cc | 2 +- mace/ops/winograd_convolution_benchmark.cc | 2 +- mace/ops/winograd_convolution_test.cc | 28 +- mace/ops/winograd_inverse_transform.cc | 39 - mace/ops/winograd_inverse_transform.h | 58 - mace/ops/winograd_transform.cc | 39 - mace/ops/winograd_transform.h | 56 - mace/ops/winograd_transform_benchmark.cc | 2 +- mace/proto/mace.proto | 10 +- mace/public/mace.h | 54 +- mace/python/tools/converter.py | 3 + .../tools/converter_tool/base_converter.py | 1 - .../tools/converter_tool/caffe_converter.py | 2 +- .../tools/converter_tool/shape_inference.py | 2 +- .../converter_tool/tensorflow_converter.py | 2 +- .../tools/converter_tool/transformer.py | 22 +- .../python/tools/mace_engine_factory.h.jinja2 | 2 +- mace/python/tools/model.jinja2 | 9 + mace/test/mace_api_mt_test.cc | 6 +- mace/test/mace_api_test.cc | 6 +- mace/tools/validation/mace_run.cc | 18 +- mace/utils/BUILD | 1 + mace/utils/status.cc | 88 ++ mace/utils/utils.h | 4 +- 385 files changed, 9125 insertions(+), 12909 deletions(-) rename mace/core/{op_kernel_context.cc => op_context.cc} (61%) rename mace/core/{op_kernel_context.h => op_context.h} (70%) create mode 100644 mace/core/op_def_registry.cc create mode 100644 mace/core/op_def_registry.h create mode 100644 mace/kernels/activation.cc create mode 100644 mace/kernels/addn.cc delete mode 100644 mace/kernels/addn.h rename mace/kernels/{argmax.h => argmax.cc} (77%) create mode 100644 mace/kernels/batch_norm.cc delete mode 100644 mace/kernels/batch_norm.h rename 
mace/kernels/{batch_to_space.h => batch_to_space.cc} (77%) rename mace/kernels/{bias_add.h => bias_add.cc} (50%) create mode 100644 mace/kernels/buffer_inverse_transform.cc delete mode 100644 mace/kernels/buffer_inverse_transform.h create mode 100644 mace/kernels/buffer_transform.cc delete mode 100644 mace/kernels/buffer_transform.h rename mace/{ops/cast.h => kernels/cast.cc} (74%) rename mace/kernels/{channel_shuffle.h => channel_shuffle.cc} (50%) rename mace/kernels/{concat.h => concat.cc} (54%) rename mace/kernels/{conv_2d.h => conv_2d.cc} (83%) rename mace/{ops => kernels}/conv_pool_2d_base.h (59%) rename mace/kernels/{crop.h => crop.cc} (63%) create mode 100644 mace/kernels/deconv_2d.cc rename mace/kernels/{depth_to_space.h => depth_to_space.cc} (62%) rename mace/kernels/{depthwise_conv2d.h => depthwise_conv2d.cc} (74%) create mode 100644 mace/kernels/eltwise.cc rename mace/kernels/{expand_dims.h => expand_dims.cc} (62%) rename mace/kernels/{fill.h => fill.cc} (69%) create mode 100644 mace/kernels/fully_connected.cc delete mode 100644 mace/kernels/fully_connected.h rename mace/kernels/{gather.h => gather.cc} (76%) create mode 100644 mace/kernels/identity.cc rename mace/{ops/infer_conv2d_shape.h => kernels/infer_conv2d_shape.cc} (68%) rename mace/kernels/{local_response_norm.h => local_response_norm.cc} (56%) rename mace/{ops/lstmcell.h => kernels/lstm_cell.cc} (50%) delete mode 100644 mace/kernels/lstmcell.h rename mace/kernels/{matmul.h => matmul.cc} (57%) delete mode 100644 mace/kernels/opencl/activation.cc create mode 100644 mace/kernels/opencl/activation.h delete mode 100644 mace/kernels/opencl/addn.cc create mode 100644 mace/kernels/opencl/addn.h delete mode 100644 mace/kernels/opencl/batch_norm.cc create mode 100644 mace/kernels/opencl/batch_norm.h delete mode 100644 mace/kernels/opencl/batch_to_space.cc rename mace/kernels/{reshape.h => opencl/batch_to_space.h} (52%) delete mode 100644 mace/kernels/opencl/bias_add.cc create mode 100644 
mace/kernels/opencl/bias_add.h delete mode 100644 mace/kernels/opencl/buffer_inverse_transform.cc create mode 100644 mace/kernels/opencl/buffer_inverse_transform.h delete mode 100644 mace/kernels/opencl/buffer_transform.cc create mode 100644 mace/kernels/opencl/buffer_transform.h delete mode 100644 mace/kernels/opencl/channel_shuffle.cc create mode 100644 mace/kernels/opencl/channel_shuffle.h delete mode 100644 mace/kernels/opencl/concat.cc create mode 100644 mace/kernels/opencl/concat.h delete mode 100644 mace/kernels/opencl/conv_2d.cc create mode 100644 mace/kernels/opencl/conv_2d.h delete mode 100644 mace/kernels/opencl/crop.cc create mode 100644 mace/kernels/opencl/crop.h delete mode 100644 mace/kernels/opencl/deconv_2d.cc create mode 100644 mace/kernels/opencl/deconv_2d.h delete mode 100644 mace/kernels/opencl/depth_to_space.cc create mode 100644 mace/kernels/opencl/depth_to_space.h delete mode 100644 mace/kernels/opencl/depthwise_conv2d.cc create mode 100644 mace/kernels/opencl/depthwise_conv2d.h delete mode 100644 mace/kernels/opencl/eltwise.cc create mode 100644 mace/kernels/opencl/eltwise.h delete mode 100644 mace/kernels/opencl/fully_connected.cc create mode 100644 mace/kernels/opencl/fully_connected.h create mode 100644 mace/kernels/opencl/lstm_cell.h delete mode 100644 mace/kernels/opencl/lstmcell.cc delete mode 100644 mace/kernels/opencl/matmul.cc create mode 100644 mace/kernels/opencl/matmul.h delete mode 100644 mace/kernels/opencl/pad.cc create mode 100644 mace/kernels/opencl/pad.h delete mode 100644 mace/kernels/opencl/pooling.cc create mode 100644 mace/kernels/opencl/pooling.h delete mode 100644 mace/kernels/opencl/reduce_mean.cc create mode 100644 mace/kernels/opencl/reduce_mean.h delete mode 100644 mace/kernels/opencl/resize_bicubic.cc create mode 100644 mace/kernels/opencl/resize_bicubic.h delete mode 100644 mace/kernels/opencl/resize_bilinear.cc create mode 100644 mace/kernels/opencl/resize_bilinear.h delete mode 100644 
mace/kernels/opencl/softmax.cc create mode 100644 mace/kernels/opencl/softmax.h delete mode 100644 mace/kernels/opencl/space_to_batch.cc rename mace/{ops/identity.h => kernels/opencl/space_to_batch.h} (51%) delete mode 100644 mace/kernels/opencl/space_to_depth.cc create mode 100644 mace/kernels/opencl/space_to_depth.h delete mode 100644 mace/kernels/opencl/split.cc create mode 100644 mace/kernels/opencl/split.h delete mode 100644 mace/kernels/opencl/sqrdiff_mean.cc create mode 100644 mace/kernels/opencl/sqrdiff_mean.h delete mode 100644 mace/kernels/opencl/winograd_transform.cc create mode 100644 mace/kernels/opencl/winograd_transform.h create mode 100644 mace/kernels/ops_register.cc rename mace/{ops => kernels}/ops_register.h (76%) create mode 100644 mace/kernels/pad.cc delete mode 100644 mace/kernels/pad.h create mode 100644 mace/kernels/pooling.cc delete mode 100644 mace/kernels/proposal.h rename mace/kernels/{quantize.h => quantize.cc} (58%) rename mace/kernels/{reduce_mean.h => reduce_mean.cc} (74%) rename mace/{ops/reshape.h => kernels/reshape.cc} (68%) create mode 100644 mace/kernels/resize_bicubic.cc create mode 100644 mace/kernels/resize_bilinear.cc rename mace/kernels/{reverse.h => reverse.cc} (73%) rename mace/kernels/{scalar_math.h => scalar_math.cc} (72%) rename mace/{ops/shape.h => kernels/shape.cc} (70%) rename mace/kernels/{softmax.h => softmax.cc} (86%) rename mace/kernels/{space_to_batch.h => space_to_batch.cc} (79%) rename mace/kernels/{space_to_depth.h => space_to_depth.cc} (60%) rename mace/kernels/{split.h => split.cc} (55%) rename mace/kernels/{sqrdiff_mean.h => sqrdiff_mean.cc} (58%) rename mace/{ops/squeeze.h => kernels/squeeze.cc} (57%) rename mace/kernels/{stack.h => stack.cc} (68%) rename mace/kernels/{strided_slice.h => strided_slice.cc} (77%) rename mace/kernels/{transpose.h => transpose.cc} (88%) rename mace/kernels/{unstack.h => unstack.cc} (76%) create mode 100644 mace/kernels/winograd_transform.cc delete mode 100644 
mace/kernels/winograd_transform.h delete mode 100644 mace/ops/activation.cc delete mode 100644 mace/ops/activation.h delete mode 100644 mace/ops/addn.cc delete mode 100644 mace/ops/addn.h delete mode 100644 mace/ops/argmax.cc delete mode 100644 mace/ops/argmax.h delete mode 100644 mace/ops/batch_norm.cc delete mode 100644 mace/ops/batch_norm.h delete mode 100644 mace/ops/batch_to_space.cc delete mode 100644 mace/ops/batch_to_space.h delete mode 100644 mace/ops/bias_add.cc delete mode 100644 mace/ops/bias_add.h delete mode 100644 mace/ops/buffer_inverse_transform.cc delete mode 100644 mace/ops/buffer_inverse_transform.h delete mode 100644 mace/ops/buffer_transform.cc delete mode 100644 mace/ops/buffer_transform.h delete mode 100644 mace/ops/cast.cc delete mode 100644 mace/ops/channel_shuffle.cc delete mode 100644 mace/ops/channel_shuffle.h delete mode 100644 mace/ops/concat.cc delete mode 100644 mace/ops/concat.h delete mode 100644 mace/ops/conv_2d.cc delete mode 100644 mace/ops/conv_2d.h delete mode 100644 mace/ops/crop.cc delete mode 100644 mace/ops/crop.h delete mode 100644 mace/ops/deconv_2d.cc delete mode 100644 mace/ops/deconv_2d.h delete mode 100644 mace/ops/depth_to_space.cc delete mode 100644 mace/ops/depth_to_space.h delete mode 100644 mace/ops/depthwise_conv2d.cc delete mode 100644 mace/ops/depthwise_conv2d.h delete mode 100644 mace/ops/eltwise.cc delete mode 100644 mace/ops/eltwise.h delete mode 100644 mace/ops/expand_dims.cc delete mode 100644 mace/ops/expand_dims.h delete mode 100644 mace/ops/fill.cc delete mode 100644 mace/ops/fill.h delete mode 100644 mace/ops/folded_batch_norm.cc delete mode 100644 mace/ops/folded_batch_norm.h delete mode 100644 mace/ops/fully_connected.cc delete mode 100644 mace/ops/fully_connected.h delete mode 100644 mace/ops/gather.cc delete mode 100644 mace/ops/gather.h delete mode 100644 mace/ops/identity.cc delete mode 100644 mace/ops/infer_conv2d_shape.cc delete mode 100644 mace/ops/local_response_norm.cc delete mode 100644 
mace/ops/local_response_norm.h delete mode 100644 mace/ops/lstmcell.cc delete mode 100644 mace/ops/matmul.cc delete mode 100644 mace/ops/matmul.h create mode 100644 mace/ops/ops_def_register.cc rename mace/{kernels/kernel.h => ops/ops_def_register.h} (68%) delete mode 100644 mace/ops/ops_register.cc delete mode 100644 mace/ops/pad.cc delete mode 100644 mace/ops/pad.h delete mode 100644 mace/ops/pooling.cc delete mode 100644 mace/ops/pooling.h delete mode 100644 mace/ops/proposal.cc delete mode 100644 mace/ops/proposal.h delete mode 100644 mace/ops/proposal_test.cc delete mode 100644 mace/ops/quantize.cc delete mode 100644 mace/ops/quantize.h delete mode 100644 mace/ops/reduce_mean.cc delete mode 100644 mace/ops/reduce_mean.h delete mode 100644 mace/ops/reshape.cc delete mode 100644 mace/ops/resize_bicubic.cc delete mode 100644 mace/ops/resize_bicubic.h delete mode 100644 mace/ops/resize_bilinear.cc delete mode 100644 mace/ops/resize_bilinear.h delete mode 100644 mace/ops/reverse.cc delete mode 100644 mace/ops/reverse.h delete mode 100644 mace/ops/scalar_math.cc delete mode 100644 mace/ops/scalar_math.h delete mode 100644 mace/ops/shape.cc delete mode 100644 mace/ops/softmax.cc delete mode 100644 mace/ops/softmax.h delete mode 100644 mace/ops/space_to_batch.cc delete mode 100644 mace/ops/space_to_batch.h delete mode 100644 mace/ops/space_to_depth.cc delete mode 100644 mace/ops/space_to_depth.h delete mode 100644 mace/ops/split.cc delete mode 100644 mace/ops/split.h delete mode 100644 mace/ops/sqrdiff_mean.cc delete mode 100644 mace/ops/sqrdiff_mean.h delete mode 100644 mace/ops/squeeze.cc delete mode 100644 mace/ops/stack.cc delete mode 100644 mace/ops/stack.h delete mode 100644 mace/ops/strided_slice.cc delete mode 100644 mace/ops/strided_slice.h delete mode 100644 mace/ops/transpose.cc delete mode 100644 mace/ops/transpose.h delete mode 100644 mace/ops/unstack.cc delete mode 100644 mace/ops/unstack.h delete mode 100644 mace/ops/winograd_inverse_transform.cc delete 
mode 100644 mace/ops/winograd_inverse_transform.h delete mode 100644 mace/ops/winograd_transform.cc delete mode 100644 mace/ops/winograd_transform.h create mode 100644 mace/utils/status.cc diff --git a/docs/development/adding_a_new_op.md b/docs/development/adding_a_new_op.md index 1b1910db..33a1a60d 100644 --- a/docs/development/adding_a_new_op.md +++ b/docs/development/adding_a_new_op.md @@ -5,107 +5,114 @@ You can create a custom op if it is not supported yet. To add a custom op, you need to follow these steps: -Define the Op class --------------------- -Define the new Op class in `mace/ops/my_custom_op.h`. - +Register the new OpDef information +---------------------------------- +Register the OpDef information about which devices the operation could run on. +Registry file is in `mace/ops/ops_def_register.cc` ```c++ -#ifndef MACE_OPS_MY_CUSTOM_OP_H_ -#define MACE_OPS_MY_CUSTOM_OP_H_ +#include "mace/ops/ops_def_register.h" + +namespace mace { +namespace ops { + +void RegisterOpDefs(OpDefRegistryBase *op_def_registry) { + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("MyCustomOp") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + ...... +} +} // namespace ops +} // namespace mace + +``` +Implement the Operation +----------------------- +The Best way is to refer to the implementation of other operator(e.g. `/mace/kernels/activation.cc`) + +Define the new Op class in `mace/kernels/my_custom_op.cc`. +1. CPU code: just write the code in `mace/kernels/my_custom_op.cc`. +2. GPU code: Kernel API is defined in `mace/kernels/my_custom_op.h`, +Kernel based on Image is realized in `mace/kernels/opencl/image/my_custom_op.cc`, +Kernel based on Buffer is realized in `mace/kernels/opencl/buffer/my_custom_op.cc`. + +The structure like the following code. 
+```c++ #include "mace/core/operator.h" -#include "mace/kernels/my_custom_op.h" namespace mace { -namespace ops { +namespace kernels { + +template +class MyCustomOp; -template -class MyCustomOp : public Operator { - public: - MyCustomOp(const OperatorDef &op_def, Workspace *ws) - : Operator(op_def, ws), - functor_() {} - - bool Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - Tensor *output = this->Output(OUTPUT); - - functor_(input, output, future); - return true; - } - - protected: - OP_INPUT_TAGS(INPUT); - OP_OUTPUT_TAGS(OUTPUT); - - private: - kernels::MyCustomOpFunctor functor_; +template <> +class MyCustomOp : public Operation { +... +} + +#ifdef MACE_ENABLE_OPENCL +template +class ActivationOp : public Operation { +... }; +#endif // MACE_ENABLE_OPENCL } // namespace ops } // namespace mace -#endif // MACE_OPS_MY_CUSTOM_OP_H_ - ``` -Register the new Op --------------------- -Define the Ops registering function in `mace/ops/my_custom_op.cc`. +Register the Operation +----------------------- +1, Add register function in `mace/kernels/my_custom_op.cc` ```c++ -#include "mace/ops/my_custom_op.h" +#include "mace/core/operator.h" namespace mace { -namespace ops { +namespace kernels { -void Register_My_Custom_Op(OperatorRegistryBase *op_registry) { - REGISTER_OPERATOR(op_registry, OpKeyBuilder("my_custom_op") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - Custom_Op); - - REGISTER_OPERATOR(op_registry, OpKeyBuilder("my_custom_op") - .Device(DeviceType::OPENCL) - .TypeConstraint("T") - .Build(), - Custom_Op); - - REGISTER_OPERATOR(op_registry, OpKeyBuilder("my_custom_op") - .Device(DeviceType::OPENCL) - .TypeConstraint("T") - .Build(), - Custom_Op); -} +void RegisterMyCustomOp(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "MyCustomOp", ActivationOp, + DeviceType::CPU, float); +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "MyCustomOp", ActivationOp, + DeviceType::GPU, float); + + 
MACE_REGISTER_OP(op_registry, "MyCustomOp", ActivationOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} } // namespace ops } // namespace mace - ``` -And then register the new Op in `mace/ops/ops_register.cc`. +2, And then register the new Op in `mace/kernels/ops_register.cc`. ``` -#include "mace/ops/ops_register.h" +#include "mace/kernels/ops_register.h" namespace mace { - namespace ops { // Keep in lexicographical order ... -extern void Register_My_Custom_Op(OperatorRegistryBase *op_registry); +extern void RegisterMyCustomOp(OpRegistryBase *op_registry); ... } // namespace ops -OperatorRegistry::OperatorRegistry() : OperatorRegistryBase() { +OpRegistry::OpRegistry() : OpRegistryBase() { // Keep in lexicographical order ... - ops::Register_My_Custom_Op(this); + ops::RegisterMyCustomOp(this); ... @@ -113,16 +120,13 @@ OperatorRegistry::OperatorRegistry() : OperatorRegistryBase() { } // namespace mace ``` +Add UTs +---------------------- +Add operation unit tests in `mace/ops/my_custom_op_test.cc` -Implement the Op kernel code ----------------------------- -You need to implement the CPU kernel in a `mace/kernels/my_custom_op.h` and -optionally OpenCL kernel in `mace/kernels/kernels/my_custom_op_opencl.cc` and -`mace/kernels/kernels/cl/my_custom_op.cl`. You can also optimize the CPU -kernel with NEON. - -Add test and benchmark +Add benchmark ---------------------- +Add operation benchmark in `mace/ops/my_custom_op_benchmark.cc` It's strongly recommended to add unit tests and micro benchmarks for your new Op. If you wish to contribute back, it's required. 
diff --git a/mace/benchmark/benchmark_model.cc b/mace/benchmark/benchmark_model.cc index 26fb2d0b..7f0afe24 100644 --- a/mace/benchmark/benchmark_model.cc +++ b/mace/benchmark/benchmark_model.cc @@ -263,7 +263,7 @@ int Main(int argc, char **argv) { FLAGS_omp_num_threads, static_cast(FLAGS_cpu_affinity_policy), true); - if (mace_status != MACE_SUCCESS) { + if (mace_status != MaceStatus::MACE_SUCCESS) { LOG(INFO) << "Set openmp or cpu affinity failed."; } #ifdef MACE_ENABLE_OPENCL diff --git a/mace/codegen/BUILD b/mace/codegen/BUILD index 8a24594c..5122da1d 100644 --- a/mace/codegen/BUILD +++ b/mace/codegen/BUILD @@ -50,6 +50,7 @@ cc_library( copts = ["-Werror", "-Wextra", "-Wno-missing-field-initializers"], deps = [ "//mace/public", + "//mace/utils", ], ) diff --git a/mace/core/future.h b/mace/core/future.h index f5807f54..6fb82d98 100644 --- a/mace/core/future.h +++ b/mace/core/future.h @@ -27,7 +27,12 @@ struct CallStats; // Wait the call to finish and get the stats if param is not nullptr struct StatsFuture { - std::function wait_fn; + std::function wait_fn = [](CallStats *stats) { + if (stats != nullptr) { + stats->start_micros = NowMicros(); + stats->end_micros = stats->start_micros; + } + }; }; inline void SetFutureDefaultWaitFn(StatsFuture *future) { diff --git a/mace/core/net.cc b/mace/core/net.cc index 2766de11..40dc404e 100644 --- a/mace/core/net.cc +++ b/mace/core/net.cc @@ -16,8 +16,10 @@ #include #include +#include "mace/core/future.h" #include "mace/core/macros.h" #include "mace/core/net.h" +#include "mace/core/op_context.h" #include "mace/public/mace.h" #include "mace/utils/memory_logging.h" #include "mace/utils/timer.h" @@ -25,39 +27,60 @@ namespace mace { -NetBase::NetBase(const std::shared_ptr op_registry, - const std::shared_ptr net_def, - Workspace *ws, - Device *device) - : op_registry_(op_registry) { - MACE_UNUSED(net_def); - MACE_UNUSED(ws); - MACE_UNUSED(device); -} - -SerialNet::SerialNet( - const std::shared_ptr op_registry, - const 
std::shared_ptr net_def, - Workspace *ws, - Device *device, - const NetMode mode) - : NetBase(op_registry, net_def, ws, device), device_(device), - op_kernel_context_(new OpKernelContext(ws, device)) { +SerialNet::SerialNet(OpDefRegistryBase *op_def_registry, + const OpRegistryBase *op_registry, + const NetDef *net_def, + Workspace *ws, + Device *target_device, + const NetMode mode) + : NetBase(), + ws_(ws), + target_device_(target_device), + cpu_device_( + new CPUDevice(target_device->cpu_runtime()->num_threads(), + target_device->cpu_runtime()->policy(), + target_device->cpu_runtime()->use_gemmlowp())) { MACE_LATENCY_LOGGER(1, "Constructing SerialNet"); - DeviceType device_type = device->device_type(); + // Register Operations + MaceStatus status; + for (int idx = 0; idx < net_def->op_types_size(); ++idx) { + status = op_def_registry->Register(net_def->op_types(idx)); + MACE_CHECK(status == MaceStatus::MACE_SUCCESS, status.information()); + } + // Create Operations + operators_.clear(); + const OpRegistrationInfo *info; + DeviceType target_device_type = target_device_->device_type(); + OpConstructContext construct_context(ws_); for (int idx = 0; idx < net_def->op_size(); ++idx) { const auto &operator_def = net_def->op(idx); - // TODO(liuqi): refactor to add device_type to OperatorDef + // Create the Operation const int op_device = ProtoArgHelper::GetOptionalArg( - operator_def, "device", static_cast(device_type)); - if (op_device == device_type) { - VLOG(3) << "Creating operator " << operator_def.name() << "(" - << operator_def.type() << ")"; + operator_def, "device", static_cast(target_device_type)); + if (op_device == target_device_type) { + // Find op registration information + status = op_def_registry->Find(operator_def.type(), &info); + MACE_CHECK(status == MaceStatus::MACE_SUCCESS, status.information()); + // Get available devices (sorted based on priority) OperatorDef temp_def(operator_def); - std::unique_ptr op( - op_registry->CreateOperator(temp_def, 
op_kernel_context_.get(), - device_type, mode)); + auto available_devices = info->device_place_func_(); + // Find the device type to run the op. + // If the target_device_type in available devices, use target_device_type, + // otherwise, fallback to the first device (top priority). + DeviceType device_type = available_devices[0]; + construct_context.set_device(cpu_device_); + for (auto device : available_devices) { + if (device == target_device_type) { + device_type = target_device_type; + construct_context.set_device(target_device_); + break; + } + } + temp_def.set_device_type(device_type); + construct_context.set_operator_def(&temp_def); + std::unique_ptr op( + op_registry->CreateOperation(&construct_context, device_type, mode)); if (op) { operators_.emplace_back(std::move(op)); } @@ -65,38 +88,59 @@ SerialNet::SerialNet( } } +MaceStatus SerialNet::Init() { + // TODO(liuqi): where to do memory reuse. + MACE_LATENCY_LOGGER(1, "Initializing SerialNet"); + OpInitContext init_context(ws_); + for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) { + auto &op = *iter; + DeviceType device_type = op->device_type(); + if (device_type == target_device_->device_type()) { + init_context.set_device(target_device_); + } else { + init_context.set_device(cpu_device_); + } + // Initialize the operation + MACE_RETURN_IF_ERROR(op->Init(&init_context)); + } + return MaceStatus::MACE_SUCCESS; +} + MaceStatus SerialNet::Run(RunMetadata *run_metadata) { + // TODO(liuqi): In/Out Buffer Transform MACE_MEMORY_LOGGING_GUARD(); MACE_LATENCY_LOGGER(1, "Running net"); - const DeviceType device_type = device_->device_type(); + OpContext context(ws_, cpu_device_); for (auto iter = operators_.begin(); iter != operators_.end(); ++iter) { auto &op = *iter; - MACE_LATENCY_LOGGER(2, "Running operator ", op->debug_def().name(), "(", - op->debug_def().type(), "), mem_id: ", + DeviceType device_type = op->device_type(); + MACE_LATENCY_LOGGER(2, "Running operator ", 
op->debug_def().name(), + "<", device_type, ", ", op->debug_def().type(), ">", + ". mem_id: ", MakeListString(op->debug_def().mem_id().data(), op->debug_def().mem_id().size())); - bool future_wait = (device_type == DeviceType::GPU && - (run_metadata != nullptr || - std::distance(iter, operators_.end()) == 1)); + if (device_type == target_device_->device_type()) { + context.set_device(target_device_); + } else { + context.set_device(cpu_device_); + } CallStats call_stats; - if (future_wait) { - StatsFuture future; - MACE_RETURN_IF_ERROR(op->Run(&future)); - if (run_metadata != nullptr) { + if (run_metadata == nullptr) { + MACE_RETURN_IF_ERROR(op->Run(&context)); + } else { + if (device_type == DeviceType::CPU) { + call_stats.start_micros = NowMicros(); + MACE_RETURN_IF_ERROR(op->Run(&context)); + call_stats.end_micros = NowMicros(); + } else if (device_type == DeviceType::GPU) { + StatsFuture future; + context.set_future(&future); + MACE_RETURN_IF_ERROR(op->Run(&context)); future.wait_fn(&call_stats); - } else { - future.wait_fn(nullptr); } - } else if (run_metadata != nullptr) { - call_stats.start_micros = NowMicros(); - MACE_RETURN_IF_ERROR(op->Run(nullptr)); - call_stats.end_micros = NowMicros(); - } else { - MACE_RETURN_IF_ERROR(op->Run(nullptr)); - } - if (run_metadata != nullptr) { + // Record run metadata std::vector strides; int padding_type = -1; std::vector paddings; @@ -150,19 +194,20 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) { << "@@" << min_v << "," << max_v; } } else { + const int bin_size = 2048; for (int ind = 0; ind < op->debug_def().quantize_info_size(); ++ind) { float min_v = op->debug_def().quantize_info(ind).minval(); float max_v = op->debug_def().quantize_info(ind).maxval(); - std::vector bin_distribution(kBinSize, 0); - float bin_v = (max_v - min_v) / kBinSize; + std::vector bin_distribution(bin_size, 0); + float bin_v = (max_v - min_v) / bin_size; Tensor::MappingGuard guard(op->Output(i)); const float *output_data = 
op->Output(i)->data(); for (index_t j = 0; j < op->Output(i)->size(); ++j) { int ind = static_cast((output_data[j] - min_v) / bin_v); if (ind < 0) ind = 0; - else if (ind > kBinSize-1) - ind = kBinSize-1; + else if (ind > bin_size-1) + ind = bin_size-1; bin_distribution[ind]++; } LOG(INFO) << "Tensor range @@" << op->debug_def().output(i) @@ -174,28 +219,6 @@ MaceStatus SerialNet::Run(RunMetadata *run_metadata) { } } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } - -std::unique_ptr CreateNet( - const std::shared_ptr op_registry, - const NetDef &net_def, - Workspace *ws, - Device *device, - const NetMode mode) { - std::shared_ptr tmp_net_def(new NetDef(net_def)); - return CreateNet(op_registry, tmp_net_def, ws, device, mode); -} - -std::unique_ptr CreateNet( - const std::shared_ptr op_registry, - const std::shared_ptr net_def, - Workspace *ws, - Device *device, - const NetMode mode) { - std::unique_ptr net( - new SerialNet(op_registry, net_def, ws, device, mode)); - return net; -} - } // namespace mace diff --git a/mace/core/net.h b/mace/core/net.h index ecff907b..799e07d4 100644 --- a/mace/core/net.h +++ b/mace/core/net.h @@ -21,64 +21,51 @@ #include #include -#include "mace/core/operator.h" -#include "mace/utils/string_util.h" +#include "mace/core/op_def_registry.h" -#define kBinSize 2048 +#include "mace/core/operator.h" namespace mace { class RunMetadata; -class OperatorBase; class Workspace; class NetBase { public: - NetBase(const std::shared_ptr op_registry, - const std::shared_ptr net_def, - Workspace *ws, - Device *device); - virtual ~NetBase() noexcept {} + NetBase() noexcept = default; + virtual ~NetBase() = default; + + virtual MaceStatus Init() = 0; virtual MaceStatus Run(RunMetadata *run_metadata = nullptr) = 0; protected: - const std::shared_ptr op_registry_; - MACE_DISABLE_COPY_AND_ASSIGN(NetBase); }; class SerialNet : public NetBase { public: - SerialNet(const std::shared_ptr op_registry, - const std::shared_ptr net_def, + 
SerialNet(OpDefRegistryBase *op_def_registry, + const OpRegistryBase *op_registry, + const NetDef *net_def, Workspace *ws, - Device *device, + Device *target_device, const NetMode mode = NetMode::NORMAL); + MaceStatus Init() override; + MaceStatus Run(RunMetadata *run_metadata = nullptr) override; protected: - std::vector > operators_; - Device *device_; - std::unique_ptr op_kernel_context_; + Workspace *ws_; + Device *target_device_; + // CPU is base device. + Device *cpu_device_; + std::vector > operators_; MACE_DISABLE_COPY_AND_ASSIGN(SerialNet); }; -std::unique_ptr CreateNet( - const std::shared_ptr op_registry, - const NetDef &net_def, - Workspace *ws, - Device *device, - const NetMode mode = NetMode::NORMAL); -std::unique_ptr CreateNet( - const std::shared_ptr op_registry, - const std::shared_ptr net_def, - Workspace *ws, - Device *device, - const NetMode mode = NetMode::NORMAL); - } // namespace mace #endif // MACE_CORE_NET_H_ diff --git a/mace/core/op_kernel_context.cc b/mace/core/op_context.cc similarity index 61% rename from mace/core/op_kernel_context.cc rename to mace/core/op_context.cc index 20f9e561..a26b5e22 100644 --- a/mace/core/op_kernel_context.cc +++ b/mace/core/op_context.cc @@ -12,21 +12,33 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/op_kernel_context.h" +#include "mace/core/op_context.h" namespace mace { -OpKernelContext::OpKernelContext(Workspace *ws, Device *device) - : device_(device), ws_(ws) {} +OpContext::OpContext(Workspace *ws, Device *device) + : device_(device), ws_(ws), future_(nullptr) {} -OpKernelContext::~OpKernelContext() = default; +OpContext::~OpContext() = default; -Device* OpKernelContext::device() { +void OpContext::set_device(Device *device) { + device_ = device; +} + +Device* OpContext::device() { return device_; } -Workspace* OpKernelContext::workspace() { +Workspace* OpContext::workspace() { return ws_; } +void OpContext::set_future(StatsFuture *future) { + future_ = future; +} + +StatsFuture *OpContext::future() { + return future_; +} + } // namespace mace diff --git a/mace/core/op_kernel_context.h b/mace/core/op_context.h similarity index 70% rename from mace/core/op_kernel_context.h rename to mace/core/op_context.h index fe5e777c..6772b14f 100644 --- a/mace/core/op_kernel_context.h +++ b/mace/core/op_context.h @@ -12,23 +12,31 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_CORE_OP_KERNEL_CONTEXT_H_ -#define MACE_CORE_OP_KERNEL_CONTEXT_H_ +#ifndef MACE_CORE_OP_CONTEXT_H_ +#define MACE_CORE_OP_CONTEXT_H_ #include "mace/core/device.h" #include "mace/core/workspace.h" +#include "mace/core/future.h" + namespace mace { -class OpKernelContext { +class OpContext { public: - OpKernelContext(Workspace *ws, Device *device); - ~OpKernelContext(); + OpContext(Workspace *ws, Device *device); + ~OpContext(); + void set_device(Device *device); Device *device(); Workspace *workspace(); + + void set_future(StatsFuture *future); + StatsFuture *future(); private: Device *device_; Workspace *ws_; + StatsFuture *future_; + // metadata }; } // namespace mace -#endif // MACE_CORE_OP_KERNEL_CONTEXT_H_ +#endif // MACE_CORE_OP_CONTEXT_H_ diff --git a/mace/core/op_def_registry.cc b/mace/core/op_def_registry.cc new file mode 100644 index 00000000..7bb8de9e --- /dev/null +++ b/mace/core/op_def_registry.cc @@ -0,0 +1,77 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mace/core/op_def_registry.h" +#include "mace/utils/logging.h" + +namespace mace { + +void AddOpRegistrar(OpDefRegistryBase *registry, + const OpRegistrationBuilder &builder) { + registry->AddRegistrar( + builder.name(), + [builder](OpRegistrationInfo *info){ + builder.Finalize(info); + }); +} + +OpRegistrationBuilder::OpRegistrationBuilder(const std::string name) + : name_(name) {} + +const std::string OpRegistrationBuilder::name() const { return name_; } + +OpRegistrationBuilder &OpRegistrationBuilder::SetDevicePlaceFunc( + std::vector (*func)()) { + info_.device_place_func_ = func; + return *this; +} + +void OpRegistrationBuilder::Finalize(OpRegistrationInfo *info) const { + *info = info_; +} + +void OpDefRegistryBase::AddRegistrar(const std::string name, + const OpRegistrar ®istrar) { + registrar_.emplace(name, registrar); +} + +MaceStatus OpDefRegistryBase::Register(const std::string &name) { + VLOG(3) << "Registering operation definition: " << name; + if (registry_.find(name) != registry_.end()) { + return MaceStatus::MACE_SUCCESS; + } + auto iter = registrar_.find(name); + if (iter == registrar_.end()) { + return MaceStatus(MaceStatus::MACE_INVALID_ARGS, + "MACE do not support the operation: " + name); + } + registry_.emplace( + name, std::unique_ptr(new OpRegistrationInfo())); + iter->second(registry_[name].get()); + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus OpDefRegistryBase::Find(const std::string &name, + const OpRegistrationInfo **info) { + auto iter = registry_.find(name); + if (iter == registry_.end()) { + *info = nullptr; + return MaceStatus(MaceStatus::MACE_INVALID_ARGS, + "Mace do not support the operation: " + name); + } + *info = iter->second.get(); + return MaceStatus::MACE_SUCCESS; +} + +} // namespace mace diff --git a/mace/core/op_def_registry.h b/mace/core/op_def_registry.h new file mode 100644 index 00000000..8e015658 --- /dev/null +++ b/mace/core/op_def_registry.h @@ -0,0 +1,81 @@ +// Copyright 2018 Xiaomi, Inc. 
All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_CORE_OP_DEF_REGISTRY_H_ +#define MACE_CORE_OP_DEF_REGISTRY_H_ + +#include +#include +#include +#include +#include + +#include "mace/proto/mace.pb.h" +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +// Device placement function +typedef std::function()> DevicePlaceFunc; + +struct OpRegistrationInfo { + OpRegistrationInfo() = default; + explicit OpRegistrationInfo(const DevicePlaceFunc &func) + : device_place_func_(func) {} + + DevicePlaceFunc device_place_func_; +}; + +class OpRegistrationBuilder { + public: + explicit OpRegistrationBuilder(const std::string name); + + const std::string name() const; + + OpRegistrationBuilder &SetDevicePlaceFunc( + std::vector (*func)()); + + void Finalize(OpRegistrationInfo *info) const; + private: + std::string name_; + OpRegistrationInfo info_; +}; + +class OpDefRegistryBase { + public: + typedef std::function OpRegistrar; + OpDefRegistryBase() = default; + virtual ~OpDefRegistryBase() = default; + void AddRegistrar(const std::string name, const OpRegistrar ®istrar); + MaceStatus Register(const std::string &name); + MaceStatus Find(const std::string &name, const OpRegistrationInfo **info); + + private: + std::unordered_map registrar_; + std::unordered_map< + std::string, + std::unique_ptr> registry_; + MACE_DISABLE_COPY_AND_ASSIGN(OpDefRegistryBase); +}; + +void AddOpRegistrar(OpDefRegistryBase 
*registry, + const OpRegistrationBuilder &builder); + +#define MACE_REGISTER_OP_DEF(op_def_registry, builder) \ + AddOpRegistrar(op_def_registry, builder) + +} // namespace mace + +#endif // MACE_CORE_OP_DEF_REGISTRY_H_ diff --git a/mace/core/operator.cc b/mace/core/operator.cc index 5e404835..d29c84e3 100644 --- a/mace/core/operator.cc +++ b/mace/core/operator.cc @@ -14,18 +14,69 @@ #include #include -#include #include #include "mace/core/operator.h" -#include "mace/core/op_kernel_context.h" namespace mace { -OperatorBase::OperatorBase(const OperatorDef &operator_def, - OpKernelContext *context) - : operator_def_(std::make_shared(operator_def)) { - MACE_UNUSED(context); +OpConstructContext::OpConstructContext(Workspace *ws) + : operator_def_(nullptr), ws_(ws), device_(nullptr) {} +OpConstructContext::OpConstructContext(OperatorDef *operator_def, + Workspace *ws, + Device *device) + : operator_def_(operator_def), ws_(ws), device_(device) {} + +OpInitContext::OpInitContext(Workspace *ws, Device *device) + : ws_(ws), device_(device) {} + +Operation::Operation(OpConstructContext *context) + : operator_def_(std::make_shared(*(context->operator_def()))) +{} + +MaceStatus Operation::Init(OpInitContext *context) { + Workspace *ws = context->workspace(); + for (const std::string &input_str : operator_def_->input()) { + const Tensor *tensor = ws->GetTensor(input_str); + MACE_CHECK(tensor != nullptr, "op ", operator_def_->type(), + ": Encountered a non-existing input tensor: ", input_str); + inputs_.push_back(tensor); + } + // TODO(liuqi): filter transform + for (int i = 0; i < operator_def_->output_size(); ++i) { + const std::string output_str = operator_def_->output(i); + if (ws->HasTensor(output_str)) { + // TODO(liuqi): Workspace should pre-allocate all of the output tensors + outputs_.push_back(ws->GetTensor(output_str)); + } else { + MACE_CHECK( + operator_def_->output_type_size() == 0 || + operator_def_->output_size() == operator_def_->output_type_size(), + "operator 
output size != operator output type size", + operator_def_->output_size(), + operator_def_->output_type_size()); + DataType output_type; + if (i < operator_def_->output_type_size()) { + output_type = operator_def_->output_type(i); + } else { + output_type = static_cast( + ProtoArgHelper::GetOptionalArg( + *operator_def_, "T", static_cast(DT_FLOAT))); + } + outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor( + output_str, context->device()->allocator(), output_type))); + + if (i < operator_def_->output_shape_size()) { + std::vector + shape_configured(operator_def_->output_shape(i).dims_size()); + for (size_t dim = 0; dim < shape_configured.size(); ++dim) { + shape_configured[dim] = operator_def_->output_shape(i).dims(dim); + } + ws->GetTensor(output_str)->SetShapeConfigured(shape_configured); + } + } + } + return MaceStatus::MACE_SUCCESS; } OpKeyBuilder::OpKeyBuilder(const char *op_name) : op_name_(op_name) {} @@ -36,7 +87,7 @@ OpKeyBuilder &OpKeyBuilder::Device(DeviceType device) { } OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name, - const DataType allowed) { + DataType allowed) { type_constraint_[attr_name] = allowed; return *this; } @@ -53,27 +104,28 @@ const std::string OpKeyBuilder::Build() { return ss.str(); } -OperatorRegistryBase::~OperatorRegistryBase() {} +OpRegistryBase::~OpRegistryBase() = default; -std::unique_ptr OperatorRegistryBase::CreateOperator( - const OperatorDef &operator_def, - OpKernelContext *context, - DeviceType type, +std::unique_ptr OpRegistryBase::CreateOperation( + OpConstructContext *context, + DeviceType device_type, const NetMode mode) const { + OperatorDef *operator_def = context->operator_def(); const int dtype = ProtoArgHelper::GetOptionalArg( - operator_def, "T", static_cast(DT_FLOAT)); + *operator_def, "T", static_cast(DT_FLOAT)); const int op_mode_i = ProtoArgHelper::GetOptionalArg( - operator_def, "mode", static_cast(NetMode::NORMAL)); + *operator_def, "mode", static_cast(NetMode::NORMAL)); const NetMode 
op_mode = static_cast(op_mode_i); - VLOG(3) << "Creating operator " << operator_def.name() << "(" - << operator_def.type() << "<" << dtype << ">" << ")"; + VLOG(3) << "Creating operator " << operator_def->name() << "(" + << operator_def->type() << "<" << dtype << ">" << ") on " + << device_type; if (op_mode == mode) { return registry_.Create( - OpKeyBuilder(operator_def.type().data()) - .Device(type) + OpKeyBuilder(operator_def->type().data()) + .Device(device_type) .TypeConstraint("T", static_cast(dtype)) .Build(), - operator_def, context); + context); } else { return nullptr; } diff --git a/mace/core/operator.h b/mace/core/operator.h index e0b84535..34de7e72 100644 --- a/mace/core/operator.h +++ b/mace/core/operator.h @@ -21,8 +21,7 @@ #include #include "mace/core/arg_helper.h" -#include "mace/core/future.h" -#include "mace/core/op_kernel_context.h" +#include "mace/core/op_context.h" #include "mace/core/registry.h" #include "mace/core/tensor.h" #include "mace/core/workspace.h" @@ -30,10 +29,66 @@ namespace mace { -class OperatorBase { +// memory_optimizer, device +class OpConstructContext { public: - explicit OperatorBase(const OperatorDef &operator_def, OpKernelContext *); - virtual ~OperatorBase() noexcept {} + explicit OpConstructContext(Workspace *ws); + OpConstructContext(OperatorDef *operator_def, Workspace *ws, Device *device); + ~OpConstructContext() = default; + + inline void set_operator_def(OperatorDef *operator_def) { + operator_def_ = operator_def; + } + + inline OperatorDef *operator_def() const { + return operator_def_; + } + + inline Workspace *workspace() const { + return ws_; + } + + inline void set_device(Device* device) { + device_ = device; + } + + inline Device *device() const { + return device_; + } + + private: + OperatorDef *operator_def_; + Workspace *ws_; + Device *device_; +}; + +// memory_optimizer, device +class OpInitContext { + public: + explicit OpInitContext(Workspace *ws, Device *device = nullptr); + ~OpInitContext() = default; 
+ + inline Workspace *workspace() const { + return ws_; + } + + inline void set_device(Device *device) { + device_ = device; + } + + inline Device *device() const { + return device_; + } + + private: + Workspace *ws_; + Device *device_; +}; + +class Operation { + public: + explicit Operation(OpConstructContext *context); + virtual ~Operation() = default; template inline T GetOptionalArg(const std::string &name, @@ -50,6 +105,10 @@ class OperatorBase { *operator_def_, name, default_value); } + inline DeviceType device_type() const { + return static_cast(operator_def_->device_type()); + } + inline const Tensor *Input(unsigned int idx) { MACE_CHECK(idx < inputs_.size()); return inputs_[idx]; @@ -63,7 +122,8 @@ class OperatorBase { inline const std::vector &Outputs() { return outputs_; } // Run Op asynchronously (depends on device), return a future if not nullptr. - virtual MaceStatus Run(StatsFuture *future) = 0; + virtual MaceStatus Init(OpInitContext *); + virtual MaceStatus Run(OpContext *) = 0; inline const OperatorDef &debug_def() const { MACE_CHECK(has_debug_def(), "operator_def was null!"); @@ -82,55 +142,7 @@ class OperatorBase { std::vector inputs_; std::vector outputs_; - MACE_DISABLE_COPY_AND_ASSIGN(OperatorBase); -}; - -template -class Operator : public OperatorBase { - public: - explicit Operator(const OperatorDef &operator_def, OpKernelContext *context) - : OperatorBase(operator_def, context) { - Workspace *ws = context->workspace(); - for (const std::string &input_str : operator_def.input()) { - const Tensor *tensor = ws->GetTensor(input_str); - MACE_CHECK(tensor != nullptr, "op ", operator_def.type(), - ": Encountered a non-existing input tensor: ", input_str); - inputs_.push_back(tensor); - } - - for (int i = 0; i < operator_def.output_size(); ++i) { - const std::string output_str = operator_def.output(i); - if (ws->HasTensor(output_str)) { - outputs_.push_back(ws->GetTensor(output_str)); - } else { - MACE_CHECK( - operator_def.output_type_size() == 0 
- || operator_def.output_size() == operator_def.output_type_size(), - "operator output size != operator output type size", - operator_def.output_size(), - operator_def.output_type_size()); - DataType output_type; - if (i < operator_def.output_type_size()) { - output_type = operator_def.output_type(i); - } else { - output_type = DataTypeToEnum::v(); - } - outputs_.push_back(MACE_CHECK_NOTNULL(ws->CreateTensor( - output_str, context->device()->allocator(), output_type))); - - if (i < operator_def.output_shape_size()) { - std::vector - shape_configured(operator_def.output_shape(i).dims_size()); - for (size_t dim = 0; dim < shape_configured.size(); ++dim) { - shape_configured[dim] = operator_def.output_shape(i).dims(dim); - } - ws->GetTensor(output_str)->SetShapeConfigured(shape_configured); - } - } - } - } - MaceStatus Run(StatsFuture *future) override = 0; - ~Operator() noexcept override {} + MACE_DISABLE_COPY_AND_ASSIGN(Operation); }; // MACE_OP_INPUT_TAGS and MACE_OP_OUTPUT_TAGS are optional features to name the @@ -154,7 +166,8 @@ class OpKeyBuilder { OpKeyBuilder &Device(DeviceType device); - OpKeyBuilder &TypeConstraint(const char *attr_name, const DataType allowed); + OpKeyBuilder &TypeConstraint(const char *attr_name, + DataType allowed); template OpKeyBuilder &TypeConstraint(const char *attr_name); @@ -172,33 +185,37 @@ OpKeyBuilder &OpKeyBuilder::TypeConstraint(const char *attr_name) { return this->TypeConstraint(attr_name, DataTypeToEnum::value); } -class OperatorRegistryBase { +class OpRegistryBase { public: typedef Registry + Operation, + OpConstructContext *> RegistryType; - OperatorRegistryBase() = default; - virtual ~OperatorRegistryBase(); + OpRegistryBase() = default; + virtual ~OpRegistryBase(); RegistryType *registry() { return ®istry_; } - std::unique_ptr CreateOperator(const OperatorDef &operator_def, - OpKernelContext *context, - DeviceType type, - const NetMode mode) const; + std::unique_ptr CreateOperation( + OpConstructContext *context, + 
DeviceType device_type, + const NetMode mode) const; private: RegistryType registry_; - MACE_DISABLE_COPY_AND_ASSIGN(OperatorRegistryBase); + MACE_DISABLE_COPY_AND_ASSIGN(OpRegistryBase); }; MACE_DECLARE_REGISTRY(OpRegistry, - OperatorBase, - const OperatorDef &, - OpKernelContext *); - -#define MACE_REGISTER_OPERATOR(op_registry, name, ...) \ - MACE_REGISTER_CLASS(OpRegistry, op_registry->registry(), name, __VA_ARGS__) + Operation, + OpConstructContext *); + +#define MACE_REGISTER_OP(op_registry, op_type, class_name, device, dt) \ + MACE_REGISTER_CLASS(OpRegistry, \ + op_registry->registry(), \ + OpKeyBuilder(op_type) \ + .Device(device) \ + .TypeConstraint
("T") \ + .Build(), \ + class_name) } // namespace mace diff --git a/mace/core/runtime/cpu/cpu_runtime.cc b/mace/core/runtime/cpu/cpu_runtime.cc index ac8a3582..ae168a54 100644 --- a/mace/core/runtime/cpu/cpu_runtime.cc +++ b/mace/core/runtime/cpu/cpu_runtime.cc @@ -18,12 +18,13 @@ #include #endif -#include #include #include #include -#include #include +#include +#include +#include #include #include @@ -85,9 +86,10 @@ MaceStatus SetThreadAffinity(cpu_set_t mask) { int err = sched_setaffinity(pid, sizeof(mask), &mask); if (err) { LOG(WARNING) << "set affinity error: " << strerror(errno); - return MACE_INVALID_ARGS; + return MaceStatus(MaceStatus::MACE_INVALID_ARGS, + "set affinity error: " + std::string(strerror(errno))); } else { - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } @@ -104,7 +106,9 @@ MaceStatus GetCPUBigLittleCoreIDs(std::vector *big_core_ids, if (cpu_max_freq[i] == 0) { LOG(WARNING) << "Cannot get CPU" << i << "'s max frequency info, maybe it is offline."; - return MACE_INVALID_ARGS; + return MaceStatus(MaceStatus::MACE_INVALID_ARGS, + "Cannot get CPU's max frequency info," + " maybe it is offline."); } } @@ -124,7 +128,7 @@ MaceStatus GetCPUBigLittleCoreIDs(std::vector *big_core_ids, } } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads, @@ -147,7 +151,8 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads, CPU_SET(cpu_id, &mask); } #ifdef MACE_ENABLE_OPENMP - std::vector status(omp_num_threads); + std::vector status(omp_num_threads, + MaceStatus::MACE_INVALID_ARGS); #pragma omp parallel for for (int i = 0; i < omp_num_threads; ++i) { VLOG(1) << "Set affinity for OpenMP thread " << omp_get_thread_num() @@ -155,10 +160,10 @@ MaceStatus SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads, status[i] = SetThreadAffinity(mask); } for (int i = 0; i < omp_num_threads; ++i) { - if (status[i] != MACE_SUCCESS) - return MACE_INVALID_ARGS; + if (status[i] != 
MaceStatus::MACE_SUCCESS) + return MaceStatus::MACE_INVALID_ARGS; } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; #else MaceStatus status = SetThreadAffinity(mask); VLOG(1) << "Set affinity without OpenMP: " << mask.__bits[0]; @@ -183,13 +188,13 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy( #else LOG(WARNING) << "Set OpenMP threads number failed: OpenMP not enabled."; #endif - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } std::vector big_core_ids; std::vector little_core_ids; MaceStatus res = GetCPUBigLittleCoreIDs(&big_core_ids, &little_core_ids); - if (res != MACE_SUCCESS) { + if (res != MaceStatus::MACE_SUCCESS) { return res; } diff --git a/mace/core/runtime/cpu/cpu_runtime.h b/mace/core/runtime/cpu/cpu_runtime.h index 4b0f796b..a6926e9e 100644 --- a/mace/core/runtime/cpu/cpu_runtime.h +++ b/mace/core/runtime/cpu/cpu_runtime.h @@ -55,6 +55,14 @@ class CPURuntime { return num_threads_; } + CPUAffinityPolicy policy() const { + return policy_; + } + + bool use_gemmlowp() const { + return gemm_context_ != nullptr; + } + private: MaceStatus SetOpenMPThreadsAndAffinityPolicy( int omp_num_threads_hint, diff --git a/mace/core/runtime/opencl/gpu_device.cc b/mace/core/runtime/opencl/gpu_device.cc index 112a94bf..09bb9181 100644 --- a/mace/core/runtime/opencl/gpu_device.cc +++ b/mace/core/runtime/opencl/gpu_device.cc @@ -38,7 +38,7 @@ OpenCLRuntime* GPUDevice::opencl_runtime() { return runtime_.get(); } -Allocator* GPUDevice::allocator() { +Allocator *GPUDevice::allocator() { return allocator_.get(); } diff --git a/mace/examples/android/macelibrary/src/main/cpp/image_classify.cc b/mace/examples/android/macelibrary/src/main/cpp/image_classify.cc index ae5bbc78..b414782a 100755 --- a/mace/examples/android/macelibrary/src/main/cpp/image_classify.cc +++ b/mace/examples/android/macelibrary/src/main/cpp/image_classify.cc @@ -27,6 +27,7 @@ #include "src/main/cpp/include/mace/public/mace.h" #include 
"src/main/cpp/include/mace/public/mace_engine_factory.h" +#include "mace/public/mace.h" namespace { @@ -112,11 +113,12 @@ Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine( omp_num_threads, static_cast(cpu_affinity_policy), true); - if (status != mace::MACE_SUCCESS) { + if (status != mace::MaceStatus::MACE_SUCCESS) { __android_log_print(ANDROID_LOG_ERROR, "image_classify attrs", - "openmp result: %d, threads: %d, cpu: %d", - status, omp_num_threads, cpu_affinity_policy); + "openmp result: %s, threads: %d, cpu: %d", + status.information().c_str(), omp_num_threads, + cpu_affinity_policy); } if (mace_context.device_type == mace::DeviceType::GPU) { config.SetGPUContext(mace_context.gpu_context); @@ -163,8 +165,8 @@ Java_com_xiaomi_mace_JniMaceUtils_maceMobilenetCreateEngine( __android_log_print(ANDROID_LOG_INFO, "image_classify attrs", - "create result: %d", - create_engine_status); + "create result: %s", + create_engine_status.information().c_str()); return create_engine_status == mace::MaceStatus::MACE_SUCCESS ? JNI_OK : JNI_ERR; diff --git a/mace/examples/cli/example.cc b/mace/examples/cli/example.cc index 6679c17e..b16c9d7d 100644 --- a/mace/examples/cli/example.cc +++ b/mace/examples/cli/example.cc @@ -170,7 +170,7 @@ bool RunModel(const std::vector &input_names, status = config.SetCPUThreadPolicy( FLAGS_omp_num_threads, static_cast(FLAGS_cpu_affinity_policy)); - if (status != MACE_SUCCESS) { + if (status != MaceStatus::MACE_SUCCESS) { std::cerr << "Set openmp or cpu affinity failed." 
<< std::endl; } #ifdef MACE_ENABLE_OPENCL diff --git a/mace/kernels/BUILD b/mace/kernels/BUILD index 6b37cf50..88528578 100644 --- a/mace/kernels/BUILD +++ b/mace/kernels/BUILD @@ -28,12 +28,20 @@ cc_library( "*_test.cc", "*_benchmark.cc", "arm/*_test.cc", + "buffer_inverse_transform.cc", + "buffer_transform.cc", + "lstm_cell.cc", + "winograd_transform.cc", ], ) + if_opencl_enabled(glob( [ "opencl/*.cc", "opencl/image/*.cc", "opencl/buffer/*.cc", + "buffer_inverse_transform.cc", + "buffer_transform.cc", + "lstm_cell.cc", + "winograd_transform.cc", ], exclude = [ "opencl/*_test.cc", @@ -44,18 +52,10 @@ cc_library( "*.h", "arm/*.h", ], - exclude = [ - "buffer_transform.h", - "buffer_inverse_transform.h", - "lstmcell.h", - ], ) + if_opencl_enabled(glob([ "opencl/*.h", "opencl/image/*.h", "opencl/buffer/*.h", - "buffer_transform.h", - "buffer_inverse_transform.h", - "lstmcell.h", ])), copts = [ "-Werror", @@ -77,7 +77,6 @@ cc_library( linkopts = if_android(["-lm"]), deps = [ "//mace/core", - "//mace/utils", "@gemmlowp", "@tflite", ], diff --git a/mace/kernels/activation.cc b/mace/kernels/activation.cc new file mode 100644 index 00000000..038c4549 --- /dev/null +++ b/mace/kernels/activation.cc @@ -0,0 +1,118 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mace/kernels/activation.h" + +#include + +#include "mace/core/operator.h" + +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/activation.h" +#endif // MACE_ENABLE_OPENCL + +namespace mace { +namespace kernels { + +template +class ActivationOp; + +template <> +class ActivationOp : public Operation { + public: + explicit ActivationOp(OpConstructContext *context) + : Operation(context), + activation_(kernels::StringToActivationType( + Operation::GetOptionalArg("activation", + "NOOP"))), + relux_max_limit_(Operation::GetOptionalArg("max_limit", + 0.0f)) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + + const float *input_ptr = input->data(); + float *output_ptr = output->mutable_data(); + if (activation_ == PRELU) { + MACE_CHECK(this->InputSize() > 1); + const Tensor *alpha = this->Input(1); + const float *alpha_ptr = alpha->data(); + const index_t outer_size = output->dim(0); + const index_t inner_size = output->dim(2) * output->dim(3); + PReLUActivation(input_ptr, outer_size, input->dim(1), inner_size, + alpha_ptr, output_ptr); + } else { + DoActivation(input_ptr, output_ptr, output->size(), activation_, + relux_max_limit_); + } + return MaceStatus::MACE_SUCCESS; + } + + private: + ActivationType activation_; + float relux_max_limit_; +}; + + +#ifdef MACE_ENABLE_OPENCL +template +class ActivationOp : public Operation { + public: + explicit ActivationOp(OpConstructContext *context) + : Operation(context) { + ActivationType type = kernels::StringToActivationType( + Operation::GetOptionalArg("activation", + "NOOP")); + auto relux_max_limit = static_cast( + Operation::GetOptionalArg("max_limit", 0.0f)); + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset( + new opencl::image::ActivationKernel(type, relux_max_limit)); + } else { + MACE_NOT_IMPLEMENTED; + } + } + 
MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(0); + const Tensor *alpha = this->InputSize() > 1 ? this->Input(1) : nullptr; + Tensor *output = this->Output(0); + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + + return kernel_->Compute(context, input, alpha, output); + } + + private: + std::unique_ptr kernel_; +}; +#endif // MACE_ENABLE_OPENCL + + +void RegisterActivation(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Activation", ActivationOp, + DeviceType::CPU, float); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "Activation", ActivationOp, + DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "Activation", ActivationOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/activation.h b/mace/kernels/activation.h index 66ec407f..12728465 100644 --- a/mace/kernels/activation.h +++ b/mace/kernels/activation.h @@ -17,15 +17,11 @@ #include #include -#include #include -#include -#include "mace/core/future.h" -#include "mace/core/tensor.h" #include "mace/core/types.h" -#include "mace/kernels/kernel.h" #include "mace/kernels/arm/activation_neon.h" +#include "mace/utils/logging.h" namespace mace { namespace kernels { @@ -153,73 +149,6 @@ void PReLUActivation(const T *input_ptr, } } -template -class ActivationFunctor; - -template <> -class ActivationFunctor : OpKernel { - public: - ActivationFunctor(OpKernelContext *context, - ActivationType type, - float relux_max_limit) - : OpKernel(context), - activation_(type), - relux_max_limit_(relux_max_limit) {} - - MaceStatus operator()(const Tensor *input, - const Tensor *alpha, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); - const float *input_ptr = input->data(); - float *output_ptr = output->mutable_data(); - if (activation_ == PRELU) { - MACE_CHECK_NOTNULL(alpha); - const float *alpha_ptr = alpha->data(); - const index_t outer_size = output->dim(0); 
- const index_t inner_size = output->dim(2) * output->dim(3); - PReLUActivation(input_ptr, outer_size, input->dim(1), inner_size, - alpha_ptr, output_ptr); - } else { - DoActivation(input_ptr, output_ptr, output->size(), activation_, - relux_max_limit_); - } - return MACE_SUCCESS; - } - - private: - ActivationType activation_; - float relux_max_limit_; -}; - -#ifdef MACE_ENABLE_OPENCL -class OpenCLActivationKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - const Tensor *alpha, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLActivationKernel); -}; -template -class ActivationFunctor : OpKernel { - public: - ActivationFunctor(OpKernelContext *context, - ActivationType type, - T relux_max_limit); - - MaceStatus operator()(const Tensor *input, - const Tensor *alpha, - Tensor *output, - StatsFuture *future); - - private: - std::unique_ptr kernel_; -}; -#endif // MACE_ENABLE_OPENCL - } // namespace kernels } // namespace mace diff --git a/mace/kernels/addn.cc b/mace/kernels/addn.cc new file mode 100644 index 00000000..6634e8e8 --- /dev/null +++ b/mace/kernels/addn.cc @@ -0,0 +1,146 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) +#include +#endif + +#include +#include + +#include "mace/core/operator.h" + +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/addn.h" +#endif // MACE_ENABLE_OPENCL + +namespace mace { +namespace kernels { + +static constexpr int kCostPerGroup = 1024; + +template +class AddNOp; + +template <> +class AddNOp : public Operation { + public: + explicit AddNOp(OpConstructContext *context) + : Operation(context) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + Tensor *output_tensor = this->Output(0); + size_t input_size = this->inputs_.size(); + MACE_RETURN_IF_ERROR(output_tensor->ResizeLike(inputs_[0])); + index_t size = output_tensor->size(); + Tensor::MappingGuard output_map(output_tensor); + float *output_data = output_tensor->mutable_data(); + memset(output_data, 0, size * sizeof(float)); + int64_t cost = size * input_size; + int64_t groups = 1; + if (cost > kCostPerGroup) { + groups = cost / kCostPerGroup; + } + int64_t element_per_group = size / groups; + + std::vector mappers; + for (size_t i = 0; i < input_size; ++i) { + MACE_CHECK(inputs_[0]->dim_size() == inputs_[i]->dim_size()); + MACE_CHECK(inputs_[0]->size() == inputs_[i]->size()) + << "Input 0: " << MakeString(inputs_[0]->shape()) + << ", size: " << inputs_[0]->size() << ". 
Input " << i << ": " + << MakeString(inputs_[i]->shape()) << ", size: " << inputs_[i]->size(); + mappers.emplace_back(Tensor::MappingGuard(inputs_[i])); + } + +#pragma omp parallel for + for (int64_t i = 0; i < size; i += element_per_group) { + int64_t count = std::min(element_per_group, size - i); + int nn = count >> 2; + int remain = count - (nn << 2); + for (size_t j = 0; j < input_size; ++j) { + const float *input_data = inputs_[j]->data(); + const float *input_ptr = input_data + i; + float *output_ptr = output_data + i; + for (int k = 0; k < nn; ++k) { +#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) + float32x4_t in = vld1q_f32(input_ptr); + float32x4_t out = vld1q_f32(output_ptr); + out = vaddq_f32(out, in); + vst1q_f32(output_ptr, out); +#else + for (int m = 0; m < 4; ++m) { + output_ptr[m] += input_ptr[m]; + } +#endif + + input_ptr += 4; + output_ptr += 4; + } + for (int k = 0; k < remain; ++k) { + *output_ptr += *input_ptr; + ++input_ptr; + ++output_ptr; + } + } + } + return MaceStatus::MACE_SUCCESS; + } +}; + +#ifdef MACE_ENABLE_OPENCL +template +class AddNOp : public Operation { + public: + explicit AddNOp(OpConstructContext *context) + : Operation(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::AddNKernel); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + Tensor *output_tensor = this->Output(0); + size_t n = this->inputs_.size(); + for (size_t i = 1; i < n; ++i) { + MACE_CHECK(inputs_[0]->dim_size() == inputs_[i]->dim_size()); + MACE_CHECK(inputs_[0]->size() == inputs_[i]->size()) + << "Input 0: " << MakeString(inputs_[0]->shape()) + << ", size: " << inputs_[0]->size() << ". 
Input " << i << ": " + << MakeString(inputs_[i]->shape()) << ", size: " << inputs_[i]->size(); + } + + return kernel_->Compute(context, inputs_, output_tensor); + } + + private: + std::unique_ptr kernel_; +}; +#endif // MACE_ENABLE_OPENCL + + +void RegisterAddN(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::CPU, float); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "AddN", AddNOp, DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/addn.h b/mace/kernels/addn.h deleted file mode 100644 index 2fa3e21a..00000000 --- a/mace/kernels/addn.h +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_KERNELS_ADDN_H_ -#define MACE_KERNELS_ADDN_H_ - -#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) -#include -#endif -#include -#include -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" - -namespace mace { -namespace kernels { - -constexpr int kCostPerGroup = 1024; - -template -struct AddNFunctor : OpKernel { - explicit AddNFunctor(OpKernelContext *context) : OpKernel(context) {} - MaceStatus operator()(const std::vector &input_tensors, - Tensor *output_tensor, - StatsFuture *future) { - MACE_UNUSED(future); - MACE_RETURN_IF_ERROR(output_tensor->ResizeLike(input_tensors[0])); - index_t size = output_tensor->size(); - Tensor::MappingGuard output_map(output_tensor); - float *output_data = output_tensor->mutable_data(); - memset(output_data, 0, size * sizeof(float)); - int n = input_tensors.size(); - int64_t cost = size * n; - int64_t groups = 1; - if (cost > kCostPerGroup) { - groups = cost / kCostPerGroup; - } - int64_t element_per_group = size / groups; - - std::vector mappers; - for (int64_t i = 0; i < n; ++i) { - mappers.emplace_back(Tensor::MappingGuard(input_tensors[i])); - } - -#pragma omp parallel for - for (int64_t i = 0; i < size; i += element_per_group) { - int64_t count = std::min(element_per_group, size - i); - int nn = count >> 2; - int remain = count - (nn << 2); - for (int64_t j = 0; j < n; ++j) { - const float *input_data = input_tensors[j]->data(); - const float *input_ptr = input_data + i; - float *output_ptr = output_data + i; - for (int k = 0; k < nn; ++k) { -#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) - float32x4_t in = vld1q_f32(input_ptr); - float32x4_t out = vld1q_f32(output_ptr); - out = vaddq_f32(out, in); - vst1q_f32(output_ptr, out); -#else - for (int m = 0; m < 4; ++m) { - output_ptr[m] += input_ptr[m]; - } -#endif - - input_ptr += 4; - output_ptr += 4; - } - for (int k = 0; k < remain; ++k) { - *output_ptr += *input_ptr; - ++input_ptr; - 
++output_ptr; - } - } - } - return MACE_SUCCESS; - } -}; - -#ifdef MACE_ENABLE_OPENCL -class OpenCLAddNKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const std::vector &input_tensors, - Tensor *output_tensor, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLAddNKernel); -}; -template -struct AddNFunctor : OpKernel { - explicit AddNFunctor(OpKernelContext *context); - MaceStatus operator()(const std::vector &input_tensors, - Tensor *output_tensor, - StatsFuture *future); - - std::unique_ptr kernel_; -}; -#endif // MACE_ENABLE_OPENCL - -} // namespace kernels -} // namespace mace - -#endif // MACE_KERNELS_ADDN_H_ diff --git a/mace/kernels/argmax.h b/mace/kernels/argmax.cc similarity index 77% rename from mace/kernels/argmax.h rename to mace/kernels/argmax.cc index 36218d62..19d52f7f 100644 --- a/mace/kernels/argmax.h +++ b/mace/kernels/argmax.cc @@ -12,32 +12,28 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_KERNELS_ARGMAX_H_ -#define MACE_KERNELS_ARGMAX_H_ - #include #include #include #include #include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" -#include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/core/operator.h" namespace mace { namespace kernels { -template -struct ArgMaxFunctor : OpKernel { - explicit ArgMaxFunctor(OpKernelContext *context) : OpKernel(context) {} - MaceStatus operator()(const Tensor *input, - const Tensor *axis, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); +template +class ArgMaxOp : public Operation { + public: + explicit ArgMaxOp(OpConstructContext *context) + : Operation(context) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + const Tensor *axis = this->Input(1); + Tensor *output = this->Output(0); MACE_CHECK(input->dim_size() > 0, "ArgMax input should not be a scalar"); MACE_CHECK(axis->dim_size() == 0, "Mace argmax only supports scalar axis"); @@ -77,11 +73,16 @@ struct ArgMaxFunctor : OpKernel { output_data[i] = idx; } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } }; + + +void RegisterArgMax(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "ArgMax", ArgMaxOp, + DeviceType::CPU, float); +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_ARGMAX_H_ diff --git a/mace/kernels/batch_norm.cc b/mace/kernels/batch_norm.cc new file mode 100644 index 00000000..b07f2f43 --- /dev/null +++ b/mace/kernels/batch_norm.cc @@ -0,0 +1,209 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "mace/core/operator.h" +#include "mace/kernels/activation.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/batch_norm.h" +#endif // MACE_ENABLE_OPENCL + +namespace mace { +namespace kernels { + +template +class BatchNormOp; + +template <> +class BatchNormOp : public Operation { + public: + explicit BatchNormOp(OpConstructContext *context) + : Operation(context), + epsilon_(Operation::GetOptionalArg("epsilon", + static_cast(1e-4))), + activation_(kernels::StringToActivationType( + Operation::GetOptionalArg("activation", "NOOP"))), + relux_max_limit_(Operation::GetOptionalArg("max_limit", 0.0f)) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + bool not_folded = this->InputSize() == 5; + const Tensor *input = this->Input(INPUT); + const Tensor *scale = this->Input(SCALE); + const Tensor *offset = this->Input(OFFSET); + + MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional. ", + input->dim_size()); + MACE_CHECK(scale->dim_size() == 1, "scale must be 1-dimensional. ", + scale->dim_size()); + MACE_CHECK(offset->dim_size() == 1, "offset must be 1-dimensional. ", + offset->dim_size()); + + Tensor *output = this->Output(OUTPUT); + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + + // Batch normalization in the paper https://arxiv.org/abs/1502.03167 . 
+ // The calculation formula for inference is + // Y = \frac{ \scale } { \sqrt{var+\variance_epsilon} } * X + + // ( \offset - \frac { \scale * mean } { + // \sqrt{var+\variance_epsilon} } + // new_scale = \frac{ \scale } { \sqrt{var+\variance_epsilon} } + // new_offset = \offset - mean * common_val; + // Y = new_scale * X + new_offset; + const index_t batch = input->dim(0); + const index_t channels = input->dim(1); + const index_t height = input->dim(2); + const index_t width = input->dim(3); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard scale_mapper(scale); + Tensor::MappingGuard offset_mapper(offset); + Tensor::MappingGuard output_mapper(output); + + const float *input_ptr = input->data(); + const float *scale_ptr = scale->data(); + const float *offset_ptr = offset->data(); + float *output_ptr = output->mutable_data(); + + std::vector new_scale; + std::vector new_offset; + if (not_folded) { + const Tensor *mean = this->Input(MEAN); + const Tensor *var = this->Input(VAR); + MACE_CHECK(mean->dim_size() == 1, "mean must be 1-dimensional. ", + mean->dim_size()); + MACE_CHECK(var->dim_size() == 1, "var must be 1-dimensional. ", + var->dim_size()); + new_scale.resize(channels); + new_offset.resize(channels); + Tensor::MappingGuard mean_mapper(mean); + Tensor::MappingGuard var_mapper(var); + const float *mean_ptr = mean->data(); + const float *var_ptr = var->data(); +#pragma omp parallel for + for (index_t c = 0; c < channels; ++c) { + new_scale[c] = scale_ptr[c] / std::sqrt(var_ptr[c] + epsilon_); + new_offset[c] = offset_ptr[c] - mean_ptr[c] * new_scale[c]; + } + } + + const float *scale_data = not_folded ? new_scale.data() : scale_ptr; + const float + *offset_data = not_folded ? 
new_offset.data() : offset_ptr; + + index_t channel_size = height * width; + index_t batch_size = channels * channel_size; + + // NEON is slower, so stick to the trivial implementaion +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch; ++b) { + for (index_t c = 0; c < channels; ++c) { + index_t offset = b * batch_size + c * channel_size; + for (index_t hw = 0; hw < height * width; ++hw) { + output_ptr[offset + hw] = + scale_data[c] * input_ptr[offset + hw] + offset_data[c]; + } + } + } + DoActivation(output_ptr, output_ptr, output->size(), activation_, + relux_max_limit_); + + return MaceStatus::MACE_SUCCESS; + } + + private: + float epsilon_; + const ActivationType activation_; + const float relux_max_limit_; + + protected: + MACE_OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR); + MACE_OP_OUTPUT_TAGS(OUTPUT); +}; + + +#ifdef MACE_ENABLE_OPENCL +template +class BatchNormOp : public Operation { + public: + explicit BatchNormOp(OpConstructContext *context) + : Operation(context) { + float epsilon = Operation::GetOptionalArg( + "epsilon", static_cast(1e-4)); + ActivationType activation = kernels::StringToActivationType( + Operation::GetOptionalArg("activation", "NOOP")); + float relux_max_limit = Operation::GetOptionalArg("max_limit", 0.0f); + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::BatchNormKernel( + epsilon, activation, relux_max_limit)); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + bool not_folded = this->InputSize() == 5; + const Tensor *input = this->Input(INPUT); + const Tensor *scale = this->Input(SCALE); + const Tensor *offset = this->Input(OFFSET); + const Tensor *mean = not_folded ? this->Input(MEAN) : nullptr; + const Tensor *var = not_folded ? this->Input(VAR) : nullptr; + + MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional. ", + input->dim_size()); + MACE_CHECK(scale->dim_size() == 1, "scale must be 1-dimensional. 
", + scale->dim_size()); + MACE_CHECK(offset->dim_size() == 1, "offset must be 1-dimensional. ", + offset->dim_size()); + if (not_folded) { + MACE_CHECK(mean->dim_size() == 1, "mean must be 1-dimensional. ", + mean->dim_size()); + MACE_CHECK(var->dim_size() == 1, "var must be 1-dimensional. ", + var->dim_size()); + } + + Tensor *output = this->Output(OUTPUT); + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + + return kernel_->Compute(context, input, scale, offset, mean, + var, output); + } + + private: + std::unique_ptr kernel_; + + protected: + MACE_OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR); + MACE_OP_OUTPUT_TAGS(OUTPUT); +}; +#endif // MACE_ENABLE_OPENCL + + +void RegisterBatchNorm(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp, + DeviceType::CPU, float); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp, + DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "BatchNorm", BatchNormOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h deleted file mode 100644 index 75e58f93..00000000 --- a/mace/kernels/batch_norm.h +++ /dev/null @@ -1,159 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_KERNELS_BATCH_NORM_H_ -#define MACE_KERNELS_BATCH_NORM_H_ - -#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) -#include -#endif -#include -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/activation.h" -#include "mace/public/mace.h" - -namespace mace { -namespace kernels { - -template -struct BatchNormFunctor; - -template<> -struct BatchNormFunctor : OpKernel { - BatchNormFunctor(OpKernelContext *context, - const bool folded_constant, - const ActivationType activation, - const float relux_max_limit) - : OpKernel(context), - folded_constant_(folded_constant), - activation_(activation), - relux_max_limit_(relux_max_limit) {} - - MaceStatus operator()(const Tensor *input, - const Tensor *scale, - const Tensor *offset, - const Tensor *mean, - const Tensor *var, - const float epsilon, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); - // Batch normalization in the paper https://arxiv.org/abs/1502.03167 . - // The calculation formula for inference is - // Y = \frac{ \scale } { \sqrt{var+\variance_epsilon} } * X + - // ( \offset - \frac { \scale * mean } { - // \sqrt{var+\variance_epsilon} } - // new_scale = \frac{ \scale } { \sqrt{var+\variance_epsilon} } - // new_offset = \offset - mean * common_val; - // Y = new_scale * X + new_offset; - const index_t batch = input->dim(0); - const index_t channels = input->dim(1); - const index_t height = input->dim(2); - const index_t width = input->dim(3); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard scale_mapper(scale); - Tensor::MappingGuard offset_mapper(offset); - Tensor::MappingGuard output_mapper(output); - - const float *input_ptr = input->data(); - const float *scale_ptr = scale->data(); - const float *offset_ptr = offset->data(); - float *output_ptr = output->mutable_data(); - - std::vector new_scale; - std::vector new_offset; - if (!folded_constant_) { - new_scale.resize(channels); - new_offset.resize(channels); - 
Tensor::MappingGuard mean_mapper(mean); - Tensor::MappingGuard var_mapper(var); - const float *mean_ptr = mean->data(); - const float *var_ptr = var->data(); -#pragma omp parallel for - for (index_t c = 0; c < channels; ++c) { - new_scale[c] = scale_ptr[c] / std::sqrt(var_ptr[c] + epsilon); - new_offset[c] = offset_ptr[c] - mean_ptr[c] * new_scale[c]; - } - } - - const float *scale_data = folded_constant_ ? scale_ptr : new_scale.data(); - const float - *offset_data = folded_constant_ ? offset_ptr : new_offset.data(); - - index_t channel_size = height * width; - index_t batch_size = channels * channel_size; - - // NEON is slower, so stick to the trivial implementaion -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch; ++b) { - for (index_t c = 0; c < channels; ++c) { - index_t offset = b * batch_size + c * channel_size; - for (index_t hw = 0; hw < height * width; ++hw) { - output_ptr[offset + hw] = - scale_data[c] * input_ptr[offset + hw] + offset_data[c]; - } - } - } - DoActivation(output_ptr, output_ptr, output->size(), activation_, - relux_max_limit_); - - return MACE_SUCCESS; - } - - const bool folded_constant_; - const ActivationType activation_; - const float relux_max_limit_; -}; - -#ifdef MACE_ENABLE_OPENCL -class OpenCLBatchNormKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - const Tensor *scale, - const Tensor *offset, - const Tensor *mean, - const Tensor *var, - const float epsilon, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBatchNormKernel); -}; -template -struct BatchNormFunctor : OpKernel { - BatchNormFunctor(OpKernelContext *context, - const bool folded_constant, - const ActivationType activation, - const float relux_max_limit); - MaceStatus operator()(const Tensor *input, - const Tensor *scale, - const Tensor *offset, - const Tensor *mean, - const Tensor *var, - const float epsilon, - Tensor *output, - StatsFuture *future); - 
std::unique_ptr kernel_; -}; -#endif // MACE_ENABLE_OPENCL - -} // namespace kernels -} // namespace mace - -#endif // MACE_KERNELS_BATCH_NORM_H_ diff --git a/mace/kernels/batch_to_space.h b/mace/kernels/batch_to_space.cc similarity index 77% rename from mace/kernels/batch_to_space.h rename to mace/kernels/batch_to_space.cc index 45b2ff88..5df98aef 100644 --- a/mace/kernels/batch_to_space.h +++ b/mace/kernels/batch_to_space.cc @@ -12,34 +12,30 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_BATCH_TO_SPACE_H_ -#define MACE_KERNELS_BATCH_TO_SPACE_H_ - -#include -#include #include +#include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" -#include "mace/public/mace.h" +#include "mace/core/operator.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/batch_to_space.h" +#endif // MACE_ENABLE_OPENCL namespace mace { namespace kernels { -struct BatchToSpaceFunctorBase : OpKernel { - BatchToSpaceFunctorBase(OpKernelContext *context, - const std::vector &paddings, - const std::vector &block_shape) - : OpKernel(context), - paddings_(paddings.begin(), paddings.end()), - block_shape_(block_shape.begin(), block_shape.end()) { +class BatchToSpaceOpBase : public Operation { + public: + explicit BatchToSpaceOpBase(OpConstructContext *context) + : Operation(context), + paddings_(Operation::GetRepeatedArgs("crops", {0, 0, 0, 0})), + block_shape_(Operation::GetRepeatedArgs("block_shape", {1, 1})) { MACE_CHECK( - block_shape.size() == 2 && block_shape[0] > 1 && block_shape[1] > 1, + block_shape_.size() == 2 && block_shape_[0] > 1 && block_shape_[1] > 1, "Block's shape should be 1D, and greater than 1"); - MACE_CHECK(paddings.size() == 4, "Paddings' shape should be 2D"); + MACE_CHECK(paddings_.size() == 4, "Paddings' shape should be 2D"); } + protected: std::vector paddings_; std::vector block_shape_; @@ -83,21 +79,19 @@ struct 
BatchToSpaceFunctorBase : OpKernel { } }; -template -struct BatchToSpaceFunctor; - -template<> -struct BatchToSpaceFunctor : BatchToSpaceFunctorBase { - BatchToSpaceFunctor(OpKernelContext *context, - const std::vector &paddings, - const std::vector &block_shape) - : BatchToSpaceFunctorBase(context, paddings, block_shape) {} +template +class BatchToSpaceNDOp; - MaceStatus operator()(const Tensor *batch_tensor, - Tensor *space_tensor, - StatsFuture *future) { - MACE_UNUSED(future); +template <> +class BatchToSpaceNDOp : public BatchToSpaceOpBase { + public: + explicit BatchToSpaceNDOp(OpConstructContext *context) + : BatchToSpaceOpBase(context) {} + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *batch_tensor = this->Input(0); + Tensor *space_tensor = this->Output(0); std::vector output_shape(4, 0); CalculateBatchToSpaceOutputShape(batch_tensor, DataFormat::NCHW, @@ -177,24 +171,21 @@ struct BatchToSpaceFunctor : BatchToSpaceFunctorBase { } // block_h } // c - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } }; -template<> -struct BatchToSpaceFunctor : BatchToSpaceFunctorBase { - BatchToSpaceFunctor(OpKernelContext *context, - const std::vector &paddings, - const std::vector &block_shape) - : BatchToSpaceFunctorBase(context, paddings, block_shape) {} - - MaceStatus operator()(const Tensor *batch_tensor, - Tensor *space_tensor, - StatsFuture *future) { - MACE_UNUSED(future); +template <> +class BatchToSpaceNDOp : public BatchToSpaceOpBase { + public: + explicit BatchToSpaceNDOp(OpConstructContext *context) + : BatchToSpaceOpBase(context) {} + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *batch_tensor = this->Input(0); + Tensor *space_tensor = this->Output(0); std::vector output_shape(4, 0); - CalculateBatchToSpaceOutputShape(batch_tensor, DataFormat::NHWC, output_shape.data()); @@ -264,38 +255,53 @@ struct BatchToSpaceFunctor : BatchToSpaceFunctorBase { } // h } // b - return 
MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } }; #ifdef MACE_ENABLE_OPENCL -class OpenCLBatchToSpaceKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *batch_tensor, - const std::vector &paddings, - const std::vector &block_shape, - const std::vector &output_shape, - Tensor *space_tensor, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBatchToSpaceKernel); -}; template -struct BatchToSpaceFunctor : BatchToSpaceFunctorBase { - BatchToSpaceFunctor(OpKernelContext *context, - const std::vector &paddings, - const std::vector &block_shape); - - MaceStatus operator()(const Tensor *batch_tensor, - Tensor *space_tensor, - StatsFuture *future); +class BatchToSpaceNDOp : public BatchToSpaceOpBase { + public: + explicit BatchToSpaceNDOp(OpConstructContext *context) + : BatchToSpaceOpBase(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::BatchToSpaceKernel); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *batch_tensor = this->Input(0); + Tensor *space_tensor = this->Output(0); + std::vector output_shape(4, 0); + CalculateBatchToSpaceOutputShape(batch_tensor, DataFormat::NHWC, + output_shape.data()); + return kernel_->Compute(context, batch_tensor, paddings_, block_shape_, + output_shape, space_tensor); + } + private: std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL + +void RegisterBatchToSpaceND(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "BatchToSpaceND", + BatchToSpaceNDOp, DeviceType::CPU, float); + + MACE_REGISTER_OP(op_registry, "BatchToSpaceND", + BatchToSpaceNDOp, DeviceType::CPU, uint8_t); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "BatchToSpaceND", + BatchToSpaceNDOp, DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "BatchToSpaceND", + BatchToSpaceNDOp, DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + } // namespace kernels } 
// namespace mace - -#endif // MACE_KERNELS_BATCH_TO_SPACE_H_ diff --git a/mace/kernels/bias_add.h b/mace/kernels/bias_add.cc similarity index 50% rename from mace/kernels/bias_add.h rename to mace/kernels/bias_add.cc index d58a4d93..fc8b7374 100644 --- a/mace/kernels/bias_add.h +++ b/mace/kernels/bias_add.cc @@ -12,43 +12,40 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_BIAS_ADD_H_ -#define MACE_KERNELS_BIAS_ADD_H_ - #include #include #include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" -#include "mace/public/mace.h" +#include "mace/core/operator.h" +#include "mace/kernels/activation.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/bias_add.h" +#endif // MACE_ENABLE_OPENCL namespace mace { namespace kernels { -struct BiasAddFunctorBase : OpKernel { - BiasAddFunctorBase(OpKernelContext *context, - const DataFormat data_format) - : OpKernel(context), data_format_(data_format) {} +template +class BiasAddOp; - DataFormat data_format_; -}; +template <> +class BiasAddOp : public Operation { + public: + explicit BiasAddOp(OpConstructContext *context) + : Operation(context), + data_format_(static_cast(Operation::GetOptionalArg( + "data_format", NHWC))) {} -template -struct BiasAddFunctor; + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + const Tensor *bias = this->Input(1); -template <> -struct BiasAddFunctor : BiasAddFunctorBase { - BiasAddFunctor(OpKernelContext *context, - const DataFormat data_format) - : BiasAddFunctorBase(context, data_format) {} + MACE_CHECK(bias->dim_size() == 1, "bias must be 1-dimensional. 
", + bias->dim_size()); - MaceStatus operator()(const Tensor *input, - const Tensor *bias, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); + Tensor *output = this->Output(0); + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); Tensor::MappingGuard input_mapper(input); Tensor::MappingGuard bias_mapper(bias); @@ -87,35 +84,60 @@ struct BiasAddFunctor : BiasAddFunctorBase { } } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } -}; -#ifdef MACE_ENABLE_OPENCL -class OpenCLBiasAddKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - const Tensor *bias, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBiasAddKernel); + private: + DataFormat data_format_; }; +#ifdef MACE_ENABLE_OPENCL template -struct BiasAddFunctor : BiasAddFunctorBase { - BiasAddFunctor(OpKernelContext *context, const DataFormat data_format); - MaceStatus operator()(const Tensor *input, - const Tensor *bias, - Tensor *output, - StatsFuture *future); +class BiasAddOp : public Operation { + public: + explicit BiasAddOp(OpConstructContext *context) + : Operation(context), + data_format_(static_cast(Operation::GetOptionalArg( + "data_format", NHWC))) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::BiasAddKernel); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(0); + const Tensor *bias = this->Input(1); + + MACE_CHECK(bias->dim_size() == 1, "bias must be 1-dimensional. 
", + bias->dim_size()); + + Tensor *output = this->Output(0); + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + MACE_CHECK(input->dim_size() == 4 && data_format_ == NHWC, + "gpu only support biasadd for 4-dimensional NHWC format tensor"); + return kernel_->Compute(context, input, bias, output); + } + private: + DataFormat data_format_; std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL + +void RegisterBiasAdd(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp, + DeviceType::CPU, float); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp, + DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "BiasAdd", BiasAddOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_BIAS_ADD_H_ diff --git a/mace/kernels/buffer_inverse_transform.cc b/mace/kernels/buffer_inverse_transform.cc new file mode 100644 index 00000000..b447334c --- /dev/null +++ b/mace/kernels/buffer_inverse_transform.cc @@ -0,0 +1,67 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include "mace/core/operator.h" +#include "mace/kernels/opencl/buffer/buffer_inverse_transform.h" +#include "mace/kernels/opencl/image/image_to_buffer.h" + +namespace mace { +namespace kernels { + +template +class BufferInverseTransformOp; + +template +class BufferInverseTransformOp : public Operation { + public: + explicit BufferInverseTransformOp(OpConstructContext *context) + : Operation(context), + wino_blk_size_(Operation::GetOptionalArg("wino_block_size", 2)) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::ImageToBuffer); + } else { + kernel_.reset(new opencl::buffer::BufferInverseTransform); + } + } + + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + + kernels::BufferType type = + static_cast(Operation::GetOptionalArg( + "buffer_type", static_cast(kernels::CONV2D_FILTER))); + + return kernel_->Compute(context, input, type, + wino_blk_size_, output); + } + + private: + const int wino_blk_size_; + std::unique_ptr kernel_; +}; + + +void RegisterBufferInverseTransform(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "BufferInverseTransform", + BufferInverseTransformOp, DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "BufferInverseTransform", + BufferInverseTransformOp, DeviceType::GPU, half); +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/buffer_inverse_transform.h b/mace/kernels/buffer_inverse_transform.h deleted file mode 100644 index 2b3e0098..00000000 --- a/mace/kernels/buffer_inverse_transform.h +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_KERNELS_BUFFER_INVERSE_TRANSFORM_H_ -#define MACE_KERNELS_BUFFER_INVERSE_TRANSFORM_H_ - -#include -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" -#include "mace/kernels/opencl/common.h" - -namespace mace { -namespace kernels { - -struct BufferInverseTransformFunctorBase : OpKernel { - BufferInverseTransformFunctorBase(OpKernelContext *context, - const int wino_blk_size) - : OpKernel(context), - wino_blk_size_(wino_blk_size) {} - const int wino_blk_size_; -}; - -template -struct BufferInverseTransformFunctor : BufferInverseTransformFunctorBase { - explicit BufferInverseTransformFunctor(OpKernelContext *context, - const int wino_blk_size) - : BufferInverseTransformFunctorBase(context, wino_blk_size) {} - MaceStatus operator()(const Tensor *input, - const BufferType type, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(input); - MACE_UNUSED(type); - MACE_UNUSED(output); - MACE_UNUSED(future); - MACE_NOT_IMPLEMENTED; - return MACE_SUCCESS; - } -}; - -class OpenCLBufferInverseTransformKernel { - public: - virtual MaceStatus Compute(OpKernelContext *context, - const Tensor *input, - const BufferType type, - const int wino_blk_size, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBufferInverseTransformKernel) -}; - -template -struct BufferInverseTransformFunctor - : BufferInverseTransformFunctorBase { - explicit BufferInverseTransformFunctor(OpKernelContext *context, - const int wino_blk_size); - MaceStatus operator()(const Tensor 
*input, - const BufferType type, - Tensor *output, - StatsFuture *future); - - std::unique_ptr kernel_; -}; - -} // namespace kernels -} // namespace mace - -#endif // MACE_KERNELS_BUFFER_INVERSE_TRANSFORM_H_ diff --git a/mace/kernels/buffer_transform.cc b/mace/kernels/buffer_transform.cc new file mode 100644 index 00000000..2b14698c --- /dev/null +++ b/mace/kernels/buffer_transform.cc @@ -0,0 +1,67 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include "mace/core/operator.h" +#include "mace/kernels/opencl/buffer/buffer_transform.h" +#include "mace/kernels/opencl/image/buffer_to_image.h" + +namespace mace { +namespace kernels { + +template +class BufferTransformOp; + +template +class BufferTransformOp : public Operation { + public: + explicit BufferTransformOp(OpConstructContext *context) + : Operation(context), + wino_blk_size_(Operation::GetOptionalArg("wino_block_size", 2)) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::BufferToImage); + } else { + kernel_.reset(new opencl::buffer::BufferTransform); + } + } + + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + + kernels::BufferType type = + static_cast(Operation::GetOptionalArg( + "buffer_type", static_cast(kernels::CONV2D_FILTER))); + + return kernel_->Compute(context, input, type, + wino_blk_size_, output); + } + + private: + const int wino_blk_size_; + std::unique_ptr kernel_; +}; + + +void RegisterBufferTransform(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "BufferTransform", + BufferTransformOp, DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "BufferTransform", + BufferTransformOp, DeviceType::GPU, half); +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/buffer_transform.h b/mace/kernels/buffer_transform.h deleted file mode 100644 index 8f0fd039..00000000 --- a/mace/kernels/buffer_transform.h +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_KERNELS_BUFFER_TRANSFORM_H_ -#define MACE_KERNELS_BUFFER_TRANSFORM_H_ - -#include -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" -#include "mace/kernels/opencl/common.h" - -namespace mace { -namespace kernels { - -struct BufferTransformFunctorBase : OpKernel { - explicit BufferTransformFunctorBase(OpKernelContext *context, - const int wino_blk_size) - : OpKernel(context), wino_blk_size_(wino_blk_size) {} - const int wino_blk_size_; -}; - -template -struct BufferTransformFunctor : BufferTransformFunctorBase { - BufferTransformFunctor(OpKernelContext *context, - const int wino_blk_size) - : BufferTransformFunctorBase(context, wino_blk_size) {} - - MaceStatus operator()(const Tensor *input, - const BufferType type, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(input); - MACE_UNUSED(type); - MACE_UNUSED(output); - MACE_UNUSED(future); - MACE_NOT_IMPLEMENTED; - return MACE_SUCCESS; - } -}; - -class OpenCLBufferTransformKernel { - public: - virtual MaceStatus Compute(OpKernelContext *context, - const Tensor *input, - const BufferType type, - const int wino_blk_size, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLBufferTransformKernel) -}; - -template -struct BufferTransformFunctor : BufferTransformFunctorBase { - BufferTransformFunctor(OpKernelContext *context, const int wino_blk_size); - - MaceStatus operator()(const Tensor *input, - const BufferType type, - Tensor *output, - StatsFuture *future); - - std::unique_ptr kernel_; -}; 
- -} // namespace kernels -} // namespace mace - -#endif // MACE_KERNELS_BUFFER_TRANSFORM_H_ diff --git a/mace/ops/cast.h b/mace/kernels/cast.cc similarity index 74% rename from mace/ops/cast.h rename to mace/kernels/cast.cc index 56d20d52..0bd971e1 100644 --- a/mace/ops/cast.h +++ b/mace/kernels/cast.cc @@ -12,24 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_CAST_H_ -#define MACE_OPS_CAST_H_ - -#include - #include "mace/core/operator.h" namespace mace { -namespace ops { +namespace kernels { template -class CastOp : public Operator { +class CastOp : public Operation { public: - CastOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context) {} + explicit CastOp(OpConstructContext *context) + : Operation(context) {} - MaceStatus Run(StatsFuture *future) override { - MACE_UNUSED(future); + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); const Tensor *input = this->Input(INPUT); Tensor *output = this->Output(OUTPUT); MACE_RETURN_IF_ERROR(output->ResizeLike(input)) @@ -47,7 +42,7 @@ class CastOp : public Operator { MACE_RUN_WITH_TYPE_ENUM(dst_dtype, MACE_CAST_COPY); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } private: @@ -55,7 +50,12 @@ class CastOp : public Operator { MACE_OP_OUTPUT_TAGS(OUTPUT); }; -} // namespace ops -} // namespace mace +void RegisterCast(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Cast", CastOp, + DeviceType::CPU, float); + MACE_REGISTER_OP(op_registry, "Cast", CastOp, + DeviceType::CPU, int32_t); +} -#endif // MACE_OPS_CAST_H_ +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/channel_shuffle.h b/mace/kernels/channel_shuffle.cc similarity index 50% rename from mace/kernels/channel_shuffle.h rename to mace/kernels/channel_shuffle.cc index d5cf5fe0..8258ea1c 100644 --- a/mace/kernels/channel_shuffle.h +++ b/mace/kernels/channel_shuffle.cc @@ -12,28 +12,33 @@ // 
See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_CHANNEL_SHUFFLE_H_ -#define MACE_KERNELS_CHANNEL_SHUFFLE_H_ - #include -#include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" +#include "mace/core/operator.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/channel_shuffle.h" +#endif // MACE_ENABLE_OPENCL namespace mace { namespace kernels { -template -struct ChannelShuffleFunctor : OpKernel { - ChannelShuffleFunctor(OpKernelContext *context, const int groups) - : OpKernel(context), groups_(groups) {} +template +class ChannelShuffleOp; - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); +template +class ChannelShuffleOp : public Operation { + public: + explicit ChannelShuffleOp(OpConstructContext *context) + : Operation(context), + groups_(Operation::GetOptionalArg("group", 1)) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + MACE_CHECK(input->dim(1) % groups_ == 0, + "input channels must be an integral multiple of group. 
", + input->dim(1)); MACE_RETURN_IF_ERROR(output->ResizeLike(input)); Tensor::MappingGuard logits_guard(input); @@ -64,35 +69,51 @@ struct ChannelShuffleFunctor : OpKernel { } } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } + private: const int groups_; }; + #ifdef MACE_ENABLE_OPENCL -class OpenCLChannelShuffleKernel { +template +class ChannelShuffleOp : public Operation { public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLChannelShuffleKernel); -}; -template -struct ChannelShuffleFunctor : OpKernel { - ChannelShuffleFunctor(OpKernelContext *context, const int groups); - - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future); + explicit ChannelShuffleOp(OpConstructContext *context) + : Operation(context) { + const int groups = Operation::GetOptionalArg("group", 1); + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::ChannelShuffleKernel(groups)); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + return kernel_->Compute(context, input, output); + } + private: std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL + +void RegisterChannelShuffle(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "ChannelShuffle", + ChannelShuffleOp, DeviceType::CPU, float); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "ChannelShuffle", + ChannelShuffleOp, DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "ChannelShuffle", + ChannelShuffleOp, DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_CHANNEL_SHUFFLE_H_ diff --git a/mace/kernels/concat.h b/mace/kernels/concat.cc similarity index 54% rename from mace/kernels/concat.h rename to 
mace/kernels/concat.cc index 0cb28861..de501192 100644 --- a/mace/kernels/concat.h +++ b/mace/kernels/concat.cc @@ -12,33 +12,54 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_CONCAT_H_ -#define MACE_KERNELS_CONCAT_H_ - #include -#include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/core/types.h" -#include "mace/kernels/kernel.h" -#include "mace/public/mace.h" +#include "mace/core/operator.h" #include "mace/utils/quantize.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/concat.h" +#endif // MACE_ENABLE_OPENCL + namespace mace { namespace kernels { -template -struct ConcatFunctor : OpKernel { - ConcatFunctor(OpKernelContext *context, const int32_t axis) - : OpKernel(context), axis_(axis) {} +class ConcatOpBase : public Operation { + public: + explicit ConcatOpBase(OpConstructContext *context) + : Operation(context), + axis_(Operation::GetOptionalArg("axis", 3)) {} + + protected: + void Validate() { + const int32_t input_dims = this->Input(0)->dim_size(); + axis_ = + axis_ < 0 ? 
axis_ + input_dims : axis_; + MACE_CHECK((0 <= axis_ && axis_ < input_dims), + "Expected concatenating axis in the range [", -input_dims, ", ", + input_dims, "], but got ", axis_); + } + + protected: + int axis_; +}; + +template +class ConcatOp; + +template +class ConcatOp : public ConcatOpBase { + public: + explicit ConcatOp(OpConstructContext *context) + : ConcatOpBase(context) {} - MaceStatus operator()(const std::vector &input_list, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); - const Tensor *input0 = input_list.front(); - const size_t inputs_count = input_list.size(); + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + Validate(); + const std::vector &inputs = this->Inputs(); + Tensor *output = this->Output(0); + const Tensor *input0 = inputs.front(); + const size_t inputs_count = inputs.size(); std::vector output_shape(input0->shape()); index_t inner_size = 1; @@ -48,7 +69,7 @@ struct ConcatFunctor : OpKernel { std::vector outer_sizes(inputs_count, 0); outer_sizes[0] = input0->size() / inner_size; for (size_t i = 1; i < inputs_count; ++i) { - const Tensor *input = input_list[i]; + const Tensor *input = inputs[i]; MACE_CHECK(input->dim_size() == input0->dim_size(), "Ranks of all input tensors must be same."); for (int j = 0; j < input->dim_size(); ++j) { @@ -65,9 +86,9 @@ struct ConcatFunctor : OpKernel { T *output_ptr = output->mutable_data(); - std::vector input_ptrs(input_list.size(), nullptr); + std::vector input_ptrs(inputs.size(), nullptr); for (size_t i = 0; i < inputs_count; ++i) { - input_ptrs[i] = input_list[i]->data(); + input_ptrs[i] = inputs[i]->data(); } for (int inner_idx = 0; inner_idx < inner_size; ++inner_idx) { for (size_t i = 0; i < inputs_count; ++i) { @@ -83,24 +104,24 @@ struct ConcatFunctor : OpKernel { } } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } - - int32_t axis_; }; -template<> -struct ConcatFunctor : OpKernel { - ConcatFunctor(OpKernelContext *context, const int32_t axis) 
- : OpKernel(context), axis_(axis) {} - - MaceStatus operator()(const std::vector &input_list, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); +template <> +class ConcatOp : public ConcatOpBase { + public: + explicit ConcatOp(OpConstructContext *context) + : ConcatOpBase(context) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + Validate(); + const std::vector &inputs = this->Inputs(); + Tensor *output = this->Output(0); MACE_CHECK(output->scale() != 0); - const Tensor *input0 = input_list.front(); - const size_t inputs_count = input_list.size(); + const Tensor *input0 = inputs.front(); + const size_t inputs_count = inputs.size(); std::vector output_shape(input0->shape()); index_t inner_size = 1; @@ -110,7 +131,7 @@ struct ConcatFunctor : OpKernel { std::vector outer_sizes(inputs_count, 0); outer_sizes[0] = input0->size() / inner_size; for (size_t i = 1; i < inputs_count; ++i) { - const Tensor *input = input_list[i]; + const Tensor *input = inputs[i]; MACE_CHECK(input->dim_size() == input0->dim_size(), "Ranks of all input tensors must be same."); for (int j = 0; j < input->dim_size(); ++j) { @@ -127,22 +148,22 @@ struct ConcatFunctor : OpKernel { auto output_ptr = output->mutable_data(); - std::vector input_ptrs(input_list.size(), nullptr); + std::vector input_ptrs(inputs.size(), nullptr); for (size_t i = 0; i < inputs_count; ++i) { - input_ptrs[i] = input_list[i]->data(); + input_ptrs[i] = inputs[i]->data(); } for (int inner_idx = 0; inner_idx < inner_size; ++inner_idx) { for (size_t i = 0; i < inputs_count; ++i) { - if (input_list[i]->zero_point() == output->zero_point() - && input_list[i]->scale() == output->scale()) { + if (inputs[i]->zero_point() == output->zero_point() + && inputs[i]->scale() == output->scale()) { memcpy(output_ptr, input_ptrs[i], outer_sizes[i] * sizeof(uint8_t)); output_ptr += outer_sizes[i]; input_ptrs[i] += outer_sizes[i]; } else { - const float scale = input_list[i]->scale() / 
output->scale(); + const float scale = inputs[i]->scale() / output->scale(); const float offset = - -input_list[i]->zero_point() * scale + output->zero_point(); + -inputs[i]->zero_point() * scale + output->zero_point(); for (index_t k = 0; k < outer_sizes[i]; ++k) { float out = (*input_ptrs[i]) * scale + offset; *output_ptr = Saturate(roundf(out)); @@ -153,35 +174,49 @@ struct ConcatFunctor : OpKernel { } } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } - - int32_t axis_; }; #ifdef MACE_ENABLE_OPENCL -class OpenCLConcatKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const std::vector &input_list, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLConcatKernel); -}; template -struct ConcatFunctor : OpKernel { - ConcatFunctor(OpKernelContext *context, const int32_t axis); - - MaceStatus operator()(const std::vector &input_list, - Tensor *output, - StatsFuture *future); +class ConcatOp : public ConcatOpBase { + public: + explicit ConcatOp(OpConstructContext *context) + : ConcatOpBase(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::ConcatKernel(axis_)); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + Validate(); + Tensor *output = this->Output(0); + return kernel_->Compute(context, inputs_, output); + } + private: std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL + +void RegisterConcat(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Concat", ConcatOp, + DeviceType::CPU, float); + + MACE_REGISTER_OP(op_registry, "Concat", ConcatOp, + DeviceType::CPU, uint8_t); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "Concat", ConcatOp, + DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "Concat", ConcatOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_CONCAT_H_ diff --git 
a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.cc similarity index 83% rename from mace/kernels/conv_2d.h rename to mace/kernels/conv_2d.cc index ebd23576..c6edbff6 100644 --- a/mace/kernels/conv_2d.h +++ b/mace/kernels/conv_2d.cc @@ -12,9 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_CONV_2D_H_ -#define MACE_KERNELS_CONV_2D_H_ - #if defined(MACE_ENABLE_NEON) && defined(__aarch64__) #include #endif @@ -22,250 +19,47 @@ #include #include #include +#include #include #include #include "mace/core/future.h" +#include "mace/core/operator.h" #include "mace/core/tensor.h" #include "mace/kernels/activation.h" -#include "mace/kernels/conv_pool_2d_util.h" #include "mace/kernels/arm/conv_2d_neon.h" #include "mace/kernels/arm/conv_winograd.h" +#include "mace/kernels/conv_pool_2d_base.h" +#include "mace/kernels/conv_pool_2d_util.h" #include "mace/kernels/gemmlowp_util.h" -#include "mace/kernels/quantize.h" #include "mace/utils/utils.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/conv_2d.h" +#include "mace/kernels/opencl/buffer/conv_2d.h" +#endif // MACE_ENABLE_OPENCL namespace mace { namespace kernels { -struct Conv2dFunctorBase : OpKernel { - Conv2dFunctorBase(OpKernelContext *context, - const int *strides, - const Padding &padding_type, - const std::vector &paddings, - const int *dilations, - const ActivationType activation, - const float relux_max_limit) - : OpKernel(context), - strides_(strides), - padding_type_(padding_type), - paddings_(paddings), - dilations_(dilations), - activation_(activation), - relux_max_limit_(relux_max_limit) {} - - const int *strides_; // [stride_h, stride_w] - const Padding padding_type_; - std::vector paddings_; - const int *dilations_; // [dilation_h, dilation_w] - const ActivationType activation_; - const float relux_max_limit_; -}; - -template -struct Conv2dFunctor; - -template<> -struct Conv2dFunctor : Conv2dFunctorBase { - 
Conv2dFunctor(OpKernelContext *context, - const int *strides, - const Padding &padding_type, - const std::vector &paddings, - const int *dilations, - const ActivationType activation, - const float relux_max_limit) - : Conv2dFunctorBase(context, - strides, - padding_type, - paddings, - dilations, - activation, - relux_max_limit), - is_filter_transformed_(false) {} - - void Conv2dGeneral(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - const index_t *filter_shape, - const int *stride_hw, - const int *dilation_hw, - float *output) { - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = filter_shape[1] * in_image_size; - const index_t out_batch_size = filter_shape[0] * out_image_size; - const index_t filter_size = filter_shape[2] * filter_shape[3]; - -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < in_shape[0]; b++) { - for (index_t m = 0; m < filter_shape[0]; m += 4) { - const index_t in_width = in_shape[3]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t out_channels = filter_shape[0]; - const index_t in_channels = filter_shape[1]; - - const int stride_h = stride_hw[0]; - const int stride_w = stride_hw[1]; - const int dilation_h = dilation_hw[0]; - const int dilation_w = dilation_hw[1]; - if (m + 3 < out_channels) { - float *out_ptr0_base = - output + b * out_batch_size + m * out_image_size; - float *out_ptr1_base = out_ptr0_base + out_image_size; - float *out_ptr2_base = out_ptr1_base + out_image_size; - float *out_ptr3_base = out_ptr2_base + out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = - filter + m * in_channels * filter_size + c * filter_size; - const float *filter_ptr1 = filter_ptr0 + in_channels * filter_size; - const float 
*filter_ptr2 = filter_ptr1 + in_channels * filter_size; - const float *filter_ptr3 = filter_ptr2 + in_channels * filter_size; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // input offset - index_t ih = h * stride_h; - index_t iw = w * stride_w; - index_t in_offset = ih * in_width + iw; - // output (4 outch x 1 height x 4 width): vo_outch_height - float vo0[4], vo1[4], vo2[4], vo3[4]; - // load output - index_t out_offset = h * out_width + w; - for (index_t ow = 0; ow < 4; ++ow) { - vo0[ow] = out_ptr0_base[out_offset + ow]; - vo1[ow] = out_ptr1_base[out_offset + ow]; - vo2[ow] = out_ptr2_base[out_offset + ow]; - vo3[ow] = out_ptr3_base[out_offset + ow]; - } - // calc by row - for (index_t kh = 0; kh < filter_shape[2]; ++kh) { - for (index_t kw = 0; kw < filter_shape[3]; ++kw) { - // outch 0 - vo0[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr0[kw]; - vo0[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr0[kw]; - vo0[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr0[kw]; - vo0[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr0[kw]; - // outch 1 - vo1[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr1[kw]; - vo1[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr1[kw]; - vo1[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr1[kw]; - vo1[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr1[kw]; - // outch 2 - vo2[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr2[kw]; - vo2[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr2[kw]; - vo2[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr2[kw]; - vo2[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr2[kw]; - // outch 3 - vo3[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr3[kw]; - vo3[1] += 
in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr3[kw]; - vo3[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr3[kw]; - vo3[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr3[kw]; - } // kw - - in_offset += dilation_h * in_width; - filter_ptr0 += filter_shape[3]; - filter_ptr1 += filter_shape[3]; - filter_ptr2 += filter_shape[3]; - filter_ptr3 += filter_shape[3]; - } // kh - - for (index_t ow = 0; ow < 4; ++ow) { - out_ptr0_base[out_offset + ow] = vo0[ow]; - out_ptr1_base[out_offset + ow] = vo1[ow]; - out_ptr2_base[out_offset + ow] = vo2[ow]; - out_ptr3_base[out_offset + ow] = vo3[ow]; - } - - filter_ptr0 -= filter_size; - filter_ptr1 -= filter_size; - filter_ptr2 -= filter_size; - filter_ptr3 -= filter_size; - } // w - } // h - } // c - } else { - for (index_t mm = m; mm < out_channels; ++mm) { - float *out_ptr0_base = - output + b * out_batch_size + mm * out_image_size; - for (index_t c = 0; c < in_channels; ++c) { - const float *in_ptr_base = - input + b * in_batch_size + c * in_image_size; - const float *filter_ptr0 = - filter + mm * in_channels * filter_size + c * filter_size; - - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w + 3 < out_width; w += 4) { - // input offset - index_t ih = h * stride_h; - index_t iw = w * stride_w; - index_t in_offset = ih * in_width + iw; - // output (1 outch x 1 height x 4 width): vo_outch_height - float vo0[4]; - // load output - index_t out_offset = h * out_width + w; - for (index_t ow = 0; ow < 4; ++ow) { - vo0[ow] = out_ptr0_base[out_offset + ow]; - } - - // calc by row - for (index_t kh = 0; kh < filter_shape[2]; ++kh) { - for (index_t kw = 0; kw < filter_shape[3]; ++kw) { - // outch 0 - vo0[0] += in_ptr_base[in_offset - + kw * dilation_w] * filter_ptr0[kw]; - vo0[1] += in_ptr_base[in_offset + stride_w - + kw * dilation_w] * filter_ptr0[kw]; - vo0[2] += in_ptr_base[in_offset + 2 * stride_w - + kw * dilation_w] * filter_ptr0[kw]; - 
vo0[3] += in_ptr_base[in_offset + 3 * stride_w - + kw * dilation_w] * filter_ptr0[kw]; - } // kw - - in_offset += dilation_h * in_width; - filter_ptr0 += filter_shape[3]; - } // kh - - for (index_t ow = 0; ow < 4; ++ow) { - out_ptr0_base[out_offset + ow] = vo0[ow]; - } - filter_ptr0 -= filter_size; - } // w - } // h - } // c - } // mm - } // if - } // m - } // b - } +template +class Conv2dOp; - MaceStatus operator()(const Tensor *input, // NCHW - const Tensor *filter, // OIHW - const Tensor *bias, - Tensor *output, // NCHW - StatsFuture *future) { - MACE_UNUSED(future); - MACE_CHECK_NOTNULL(input); - MACE_CHECK_NOTNULL(filter); - MACE_CHECK_NOTNULL(output); +template <> +class Conv2dOp : public ConvPool2dOpBase { + public: + explicit Conv2dOp(OpConstructContext *context) + : ConvPool2dOpBase(context), + activation_(kernels::StringToActivationType( + Operation::GetOptionalArg("activation", + "NOOP"))), + relux_max_limit_(Operation::GetOptionalArg("max_limit", 0.0f)), + is_filter_transformed_(false) {} + + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(INPUT); + const Tensor *filter = this->Input(FILTER); + const Tensor *bias = this->InputSize() >= 3 ? 
this->Input(BIAS) : nullptr; + Tensor *output = this->Output(OUTPUT); std::vector filter_shape(4); filter_shape = filter->shape(); @@ -275,8 +69,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { if (paddings_.empty()) { CalcNCHWPaddingAndOutputSize(input->shape().data(), filter_shape.data(), - dilations_, - strides_, + dilations_.data(), + strides_.data(), padding_type_, output_shape.data(), paddings.data()); @@ -285,8 +79,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { CalcNCHWOutputSize(input->shape().data(), filter_shape.data(), paddings_.data(), - dilations_, - strides_, + dilations_.data(), + strides_.data(), RoundType::FLOOR, output_shape.data()); } @@ -340,15 +134,15 @@ struct Conv2dFunctor : Conv2dFunctorBase { std::function conv_func; bool - use_winograd = filter_h == 3 && filter_w == 3 - && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1 - && input_channels >= 8 && channels >= 8; + use_winograd = filter_h == 3 && filter_w == 3 + && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1 + && input_channels >= 8 && channels >= 8; bool use_neon_3x3_s1 = filter_h == 3 && filter_w == 3 - && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1; + && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1; bool use_neon_3x3_s2 = filter_h == 3 && filter_w == 3 - && stride_h == 2 && stride_w == 2 && dilation_h == 1 && dilation_w == 1; + && stride_h == 2 && stride_w == 2 && dilation_h == 1 && dilation_w == 1; bool use_neon_1x1_s1 = filter_h == 1 && filter_w == 1 - && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1; + && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1; bool use_neon_5x5_s1 = filter_h == 5 && filter_w == 5 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1; bool use_neon_1x7_s1 = filter_h == 1 && filter_w == 7 @@ -380,7 +174,7 @@ struct Conv2dFunctor : Conv2dFunctorBase { if (use_winograd) { extra_output_height = 
RoundUp(height, winograd_out_tile_size); extra_input_height = - std::max(padded_input_height, extra_output_height + 2); + std::max(padded_input_height, extra_output_height + 2); extra_output_width = RoundUp(width, winograd_out_tile_size); extra_input_width = std::max(padded_input_width, extra_output_width + 2); if (extra_input_height != padded_input_height) { @@ -394,7 +188,7 @@ struct Conv2dFunctor : Conv2dFunctorBase { index_t tile_width_count = extra_output_width / winograd_out_tile_size; index_t tile_count = tile_height_count * tile_width_count; index_t in_tile_area = - (winograd_out_tile_size + 2) * (winograd_out_tile_size + 2); + (winograd_out_tile_size + 2) * (winograd_out_tile_size + 2); transformed_input_shape.insert(transformed_input_shape.end(), {in_tile_area, batch, input_channels, @@ -455,17 +249,17 @@ struct Conv2dFunctor : Conv2dFunctorBase { total_scratch_size += transformed_input_size + transformed_output_size; } if (extra_input_height != input_height - || extra_input_width != input_width) { + || extra_input_width != input_width) { padded_input_size = - batch * input_channels * (input_height + pad_top + pad_bottom) - * (input_width + pad_left + pad_right) * sizeof(float) + - MACE_EXTRA_BUFFER_PAD_SIZE; + batch * input_channels * (input_height + pad_top + pad_bottom) + * (input_width + pad_left + pad_right) * sizeof(float) + + MACE_EXTRA_BUFFER_PAD_SIZE; total_scratch_size += padded_input_size; } if (extra_output_height != height || extra_output_width != width) { padded_output_size = - batch * channels * extra_output_height * extra_output_width - * sizeof(float); + batch * channels * extra_output_height * extra_output_width + * sizeof(float); total_scratch_size += padded_output_size; } // scratch for sgemm, preoccupy enough buffer @@ -478,13 +272,13 @@ struct Conv2dFunctor : Conv2dFunctorBase { } // Init scratch buffer - ScratchBuffer *scratch = context_->device()->scratch_buffer(); + ScratchBuffer *scratch = context->device()->scratch_buffer(); 
scratch->Rewind(); scratch->GrowSize(total_scratch_size); Tensor - transformed_input(scratch->Scratch(transformed_input_size), DT_FLOAT); + transformed_input(scratch->Scratch(transformed_input_size), DT_FLOAT); Tensor - transformed_output(scratch->Scratch(transformed_output_size), DT_FLOAT); + transformed_output(scratch->Scratch(transformed_output_size), DT_FLOAT); Tensor padded_input(scratch->Scratch(padded_input_size), DT_FLOAT); Tensor padded_output(scratch->Scratch(padded_output_size), DT_FLOAT); const index_t extra_input_shape[4] = @@ -624,10 +418,10 @@ struct Conv2dFunctor : Conv2dFunctorBase { } else if (use_neon_1x15_s1) { conv_func = [=](const float *pad_input, float *pad_output) { Conv2dNeonK1x15S1(pad_input, - filter_data, - extra_input_shape, - extra_output_shape, - pad_output); + filter_data, + extra_input_shape, + extra_output_shape, + pad_output); }; } else if (use_neon_15x1_s1) { conv_func = [=](const float *pad_input, float *pad_output) { @@ -644,8 +438,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { extra_input_shape, extra_output_shape, filter_shape.data(), - strides_, - dilations_, + strides_.data(), + dilations_.data(), pad_output); }; } @@ -653,13 +447,9 @@ struct Conv2dFunctor : Conv2dFunctorBase { // pad input and output const Tensor *pad_input_ptr = input; if (extra_input_height != input_height - || extra_input_width != input_width) { - MACE_RETURN_IF_ERROR(ConstructNCHWInputWithSpecificPadding(input, - pad_top, - pad_bottom, - pad_left, - pad_right, - &padded_input)); + || extra_input_width != input_width) { + MACE_RETURN_IF_ERROR(ConstructNCHWInputWithSpecificPadding( + input, pad_top, pad_bottom, pad_left, pad_right, &padded_input)); pad_input_ptr = &padded_input; } @@ -667,7 +457,7 @@ struct Conv2dFunctor : Conv2dFunctorBase { Tensor *pad_output_ptr = output; if (extra_output_height != height || extra_output_width != width) { padded_output.Reshape({batch, channels, extra_output_height, - extra_output_width}); + extra_output_width}); 
padded_output.Clear(); pad_output_ptr = &padded_output; } else if (!use_neon_1x1_s1) { @@ -686,13 +476,13 @@ struct Conv2dFunctor : Conv2dFunctorBase { for (index_t c = 0; c < channels; ++c) { for (index_t h = 0; h < height; ++h) { memcpy( - output_data + b * channels * height * width + c * height * width - + h * width, - pad_output_data - + b * channels * extra_output_height * extra_output_width - + c * extra_output_height * extra_output_width - + h * extra_output_width, - sizeof(float) * width); + output_data + b * channels * height * width + c * height * width + + h * width, + pad_output_data + + b * channels * extra_output_height * extra_output_width + + c * extra_output_height * extra_output_width + + h * extra_output_width, + sizeof(float) * width); } } } @@ -727,123 +517,216 @@ struct Conv2dFunctor : Conv2dFunctorBase { DoActivation(output_data, output_data, output->size(), activation_, relux_max_limit_); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } - bool is_filter_transformed_; - SGemm sgemm_; -}; - -template<> -struct Conv2dFunctor : Conv2dFunctorBase { - Conv2dFunctor(OpKernelContext *context, - const int *strides, - const Padding &padding_type, - const std::vector &paddings, - const int *dilations, - const ActivationType activation, - const float relux_max_limit) - : Conv2dFunctorBase(context, - strides, - padding_type, - paddings, - dilations, - activation, - relux_max_limit) {} - - template - inline void Im2col( - const T *in_data, const std::vector &in_shape, - const index_t filter_h, const index_t filter_w, const index_t stride_h, - const index_t stride_w, const T zero_point, const int pad_height, - const int pad_width, const std::vector &out_shape, - const index_t depth, T* im2col_data) { - const index_t input_row_size = in_shape[2] * in_shape[3]; - const index_t patch_row_size = filter_w * in_shape[3]; - -#pragma omp parallel for collapse(3) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t h = 0; h < out_shape[1]; ++h) 
{ - for (index_t w = 0; w < out_shape[2]; ++w) { - // Reshape a patch of input to column, which is corresponding to - // a column of output(:, column). - const index_t ih_begin = h * stride_h - (pad_height >> 1); - const index_t ih_end = ih_begin + filter_h; - const index_t iw_begin = w * stride_w - (pad_width >> 1); - const index_t iw_end = iw_begin + filter_w; - // gate height and width to separate padding - const index_t ih_begin_gated = std::max(0, ih_begin); - const index_t ih_end_gated = std::min(ih_end, in_shape[1]); - const index_t iw_begin_gated = std::max(0, iw_begin); - const index_t iw_end_gated = std::min(iw_end, in_shape[2]); - const index_t pad_top = std::max(0, -ih_begin); - const index_t pad_bottom = ih_end - ih_end_gated; - const index_t pad_left = std::max(0, -iw_begin); - const index_t pad_right = iw_end - iw_end_gated; - index_t im2col_column_offset = - ((b * out_shape[1] + h) * out_shape[2] + w) * depth; + private: + void Conv2dGeneral(const float *input, + const float *filter, + const index_t *in_shape, + const index_t *out_shape, + const index_t *filter_shape, + const int *stride_hw, + const int *dilation_hw, + float *output) { + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = filter_shape[1] * in_image_size; + const index_t out_batch_size = filter_shape[0] * out_image_size; + const index_t filter_size = filter_shape[2] * filter_shape[3]; - // fill in padding top - if (pad_top > 0) { - std::fill_n(im2col_data + im2col_column_offset, - pad_top * patch_row_size, zero_point); - } +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < in_shape[0]; b++) { + for (index_t m = 0; m < filter_shape[0]; m += 4) { + const index_t in_width = in_shape[3]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t out_channels = filter_shape[0]; + const index_t in_channels = filter_shape[1]; - const 
index_t patch_row_size_gated = - std::min(filter_w - pad_left, - in_shape[2] - iw_begin_gated) * in_shape[3]; - MACE_CHECK(patch_row_size_gated == - ((filter_w - (pad_left + pad_right)) * in_shape[3])); - const index_t pad_left_size = pad_left * in_shape[3]; - const index_t pad_right_size = pad_right * in_shape[3]; - index_t im2col_offset = im2col_column_offset + - (pad_top * filter_w + pad_left) * in_shape[3]; - index_t in_offset = ((b * in_shape[1] + ih_begin_gated) * in_shape[2] - + iw_begin_gated) * in_shape[3]; + const int stride_h = stride_hw[0]; + const int stride_w = stride_hw[1]; + const int dilation_h = dilation_hw[0]; + const int dilation_w = dilation_hw[1]; + if (m + 3 < out_channels) { + float *out_ptr0_base = + output + b * out_batch_size + m * out_image_size; + float *out_ptr1_base = out_ptr0_base + out_image_size; + float *out_ptr2_base = out_ptr1_base + out_image_size; + float *out_ptr3_base = out_ptr2_base + out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input + b * in_batch_size + c * in_image_size; + const float *filter_ptr0 = + filter + m * in_channels * filter_size + c * filter_size; + const float *filter_ptr1 = filter_ptr0 + in_channels * filter_size; + const float *filter_ptr2 = filter_ptr1 + in_channels * filter_size; + const float *filter_ptr3 = filter_ptr2 + in_channels * filter_size; + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // input offset + index_t ih = h * stride_h; + index_t iw = w * stride_w; + index_t in_offset = ih * in_width + iw; + // output (4 outch x 1 height x 4 width): vo_outch_height + float vo0[4], vo1[4], vo2[4], vo3[4]; + // load output + index_t out_offset = h * out_width + w; + for (index_t ow = 0; ow < 4; ++ow) { + vo0[ow] = out_ptr0_base[out_offset + ow]; + vo1[ow] = out_ptr1_base[out_offset + ow]; + vo2[ow] = out_ptr2_base[out_offset + ow]; + vo3[ow] = out_ptr3_base[out_offset + ow]; + } + // calc by row + for 
(index_t kh = 0; kh < filter_shape[2]; ++kh) { + for (index_t kw = 0; kw < filter_shape[3]; ++kw) { + // outch 0 + vo0[0] += in_ptr_base[in_offset + + kw * dilation_w] * filter_ptr0[kw]; + vo0[1] += in_ptr_base[in_offset + stride_w + + kw * dilation_w] * filter_ptr0[kw]; + vo0[2] += in_ptr_base[in_offset + 2 * stride_w + + kw * dilation_w] * filter_ptr0[kw]; + vo0[3] += in_ptr_base[in_offset + 3 * stride_w + + kw * dilation_w] * filter_ptr0[kw]; + // outch 1 + vo1[0] += in_ptr_base[in_offset + + kw * dilation_w] * filter_ptr1[kw]; + vo1[1] += in_ptr_base[in_offset + stride_w + + kw * dilation_w] * filter_ptr1[kw]; + vo1[2] += in_ptr_base[in_offset + 2 * stride_w + + kw * dilation_w] * filter_ptr1[kw]; + vo1[3] += in_ptr_base[in_offset + 3 * stride_w + + kw * dilation_w] * filter_ptr1[kw]; + // outch 2 + vo2[0] += in_ptr_base[in_offset + + kw * dilation_w] * filter_ptr2[kw]; + vo2[1] += in_ptr_base[in_offset + stride_w + + kw * dilation_w] * filter_ptr2[kw]; + vo2[2] += in_ptr_base[in_offset + 2 * stride_w + + kw * dilation_w] * filter_ptr2[kw]; + vo2[3] += in_ptr_base[in_offset + 3 * stride_w + + kw * dilation_w] * filter_ptr2[kw]; + // outch 3 + vo3[0] += in_ptr_base[in_offset + + kw * dilation_w] * filter_ptr3[kw]; + vo3[1] += in_ptr_base[in_offset + stride_w + + kw * dilation_w] * filter_ptr3[kw]; + vo3[2] += in_ptr_base[in_offset + 2 * stride_w + + kw * dilation_w] * filter_ptr3[kw]; + vo3[3] += in_ptr_base[in_offset + 3 * stride_w + + kw * dilation_w] * filter_ptr3[kw]; + } // kw - // fill in effective rows - for (index_t ih = ih_begin_gated; ih < ih_end_gated; ++ih) { - // fill in padding left - if (pad_left > 0) { - const index_t left_offset = im2col_offset - pad_left_size; - std::fill_n(im2col_data + left_offset, pad_left_size, zero_point); - } - // copy effective data - std::copy_n(in_data + in_offset, patch_row_size_gated, - im2col_data + im2col_offset); - // fill in padding right - if (pad_right > 0) { - const index_t right_offset = im2col_offset + 
patch_row_size_gated; - std::fill_n(im2col_data + right_offset, pad_right_size, - zero_point); - } - in_offset += input_row_size; - im2col_offset += patch_row_size; - } + in_offset += dilation_h * in_width; + filter_ptr0 += filter_shape[3]; + filter_ptr1 += filter_shape[3]; + filter_ptr2 += filter_shape[3]; + filter_ptr3 += filter_shape[3]; + } // kh - // fill in padding bottom - if (pad_bottom > 0) { - const index_t pad_bottom_size = pad_bottom * patch_row_size; - const index_t bottom_offset = - im2col_column_offset + depth - pad_bottom_size; - std::fill_n(im2col_data + bottom_offset, pad_bottom_size, - zero_point); - } - } - } - } + for (index_t ow = 0; ow < 4; ++ow) { + out_ptr0_base[out_offset + ow] = vo0[ow]; + out_ptr1_base[out_offset + ow] = vo1[ow]; + out_ptr2_base[out_offset + ow] = vo2[ow]; + out_ptr3_base[out_offset + ow] = vo3[ow]; + } + + filter_ptr0 -= filter_size; + filter_ptr1 -= filter_size; + filter_ptr2 -= filter_size; + filter_ptr3 -= filter_size; + } // w + } // h + } // c + } else { + for (index_t mm = m; mm < out_channels; ++mm) { + float *out_ptr0_base = + output + b * out_batch_size + mm * out_image_size; + for (index_t c = 0; c < in_channels; ++c) { + const float *in_ptr_base = + input + b * in_batch_size + c * in_image_size; + const float *filter_ptr0 = + filter + mm * in_channels * filter_size + c * filter_size; + + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w + 3 < out_width; w += 4) { + // input offset + index_t ih = h * stride_h; + index_t iw = w * stride_w; + index_t in_offset = ih * in_width + iw; + // output (1 outch x 1 height x 4 width): vo_outch_height + float vo0[4]; + // load output + index_t out_offset = h * out_width + w; + for (index_t ow = 0; ow < 4; ++ow) { + vo0[ow] = out_ptr0_base[out_offset + ow]; + } + + // calc by row + for (index_t kh = 0; kh < filter_shape[2]; ++kh) { + for (index_t kw = 0; kw < filter_shape[3]; ++kw) { + // outch 0 + vo0[0] += in_ptr_base[in_offset + + kw * dilation_w] * 
filter_ptr0[kw]; + vo0[1] += in_ptr_base[in_offset + stride_w + + kw * dilation_w] * filter_ptr0[kw]; + vo0[2] += in_ptr_base[in_offset + 2 * stride_w + + kw * dilation_w] * filter_ptr0[kw]; + vo0[3] += in_ptr_base[in_offset + 3 * stride_w + + kw * dilation_w] * filter_ptr0[kw]; + } // kw + + in_offset += dilation_h * in_width; + filter_ptr0 += filter_shape[3]; + } // kh + + for (index_t ow = 0; ow < 4; ++ow) { + out_ptr0_base[out_offset + ow] = vo0[ow]; + } + filter_ptr0 -= filter_size; + } // w + } // h + } // c + } // mm + } // if + } // m + } // b } - MaceStatus operator()(const Tensor *input, // NHWC - const Tensor *filter, // OHWI - const Tensor *bias, - Tensor *output, // NHWC - StatsFuture *future) { - MACE_UNUSED(future); + private: + const ActivationType activation_; + const float relux_max_limit_; + bool is_filter_transformed_; + SGemm sgemm_; + + private: + MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS); + MACE_OP_OUTPUT_TAGS(OUTPUT); +}; + + +template <> +class Conv2dOp : public ConvPool2dOpBase { + public: + explicit Conv2dOp(OpConstructContext *context) + : ConvPool2dOpBase(context), + activation_(kernels::StringToActivationType( + Operation::GetOptionalArg("activation", + "NOOP"))), + relux_max_limit_(Operation::GetOptionalArg("max_limit", 0.0f)) {} + + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(INPUT); + const Tensor *filter = this->Input(FILTER); + const Tensor *bias = this->InputSize() >= 3 ? 
this->Input(BIAS) : nullptr; + Tensor *output = this->Output(OUTPUT); + MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1, "Quantization convolution does not support dilation > 1 yet."); - auto gemm_context = context_->device()->cpu_runtime()->GetGemmlowpContext(); + auto gemm_context = context->device()->cpu_runtime()->GetGemmlowpContext(); MACE_CHECK_NOTNULL(gemm_context); std::vector output_shape(4); @@ -853,8 +736,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { NHWC, filter->shape().data(), OHWI, - dilations_, - strides_, + dilations_.data(), + strides_.data(), padding_type_, output_shape.data(), paddings.data()); @@ -865,8 +748,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { filter->shape().data(), OHWI, paddings_.data(), - dilations_, - strides_, + dilations_.data(), + strides_.data(), RoundType::FLOOR, output_shape.data()); } @@ -916,7 +799,7 @@ struct Conv2dFunctor : Conv2dFunctorBase { bool im2col_required = filter_h != 1 || filter_w != 1 || stride_h != 1 || stride_w != 1; total_scratch_size += (im2col_required ? 
im2col_size : 0); - ScratchBuffer *scratch = context_->device()->scratch_buffer(); + ScratchBuffer *scratch = context->device()->scratch_buffer(); scratch->Rewind(); scratch->GrowSize(total_scratch_size); @@ -965,50 +848,156 @@ struct Conv2dFunctor : Conv2dFunctorBase { gemm_context, filter_matrix, input_matrix, &output_matrix, -filter->zero_point(), -input->zero_point(), output_pipeline); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } -}; -#ifdef MACE_ENABLE_OPENCL -class OpenCLConv2dKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int *strides, - const Padding &padding_type, - const std::vector &padding_data, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLConv2dKernel); + private: + template + inline void Im2col( + const T *in_data, const std::vector &in_shape, + const index_t filter_h, const index_t filter_w, const index_t stride_h, + const index_t stride_w, const T zero_point, const int pad_height, + const int pad_width, const std::vector &out_shape, + const index_t depth, T* im2col_data) { + const index_t input_row_size = in_shape[2] * in_shape[3]; + const index_t patch_row_size = filter_w * in_shape[3]; + +#pragma omp parallel for collapse(3) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t h = 0; h < out_shape[1]; ++h) { + for (index_t w = 0; w < out_shape[2]; ++w) { + // Reshape a patch of input to column, which is corresponding to + // a column of output(:, column). 
+ const index_t ih_begin = h * stride_h - (pad_height >> 1); + const index_t ih_end = ih_begin + filter_h; + const index_t iw_begin = w * stride_w - (pad_width >> 1); + const index_t iw_end = iw_begin + filter_w; + // gate height and width to separate padding + const index_t ih_begin_gated = std::max(0, ih_begin); + const index_t ih_end_gated = std::min(ih_end, in_shape[1]); + const index_t iw_begin_gated = std::max(0, iw_begin); + const index_t iw_end_gated = std::min(iw_end, in_shape[2]); + const index_t pad_top = std::max(0, -ih_begin); + const index_t pad_bottom = ih_end - ih_end_gated; + const index_t pad_left = std::max(0, -iw_begin); + const index_t pad_right = iw_end - iw_end_gated; + index_t im2col_column_offset = + ((b * out_shape[1] + h) * out_shape[2] + w) * depth; + + // fill in padding top + if (pad_top > 0) { + std::fill_n(im2col_data + im2col_column_offset, + pad_top * patch_row_size, zero_point); + } + + const index_t patch_row_size_gated = + std::min(filter_w - pad_left, + in_shape[2] - iw_begin_gated) * in_shape[3]; + MACE_CHECK(patch_row_size_gated == + ((filter_w - (pad_left + pad_right)) * in_shape[3])); + const index_t pad_left_size = pad_left * in_shape[3]; + const index_t pad_right_size = pad_right * in_shape[3]; + index_t im2col_offset = im2col_column_offset + + (pad_top * filter_w + pad_left) * in_shape[3]; + index_t in_offset = ((b * in_shape[1] + ih_begin_gated) * in_shape[2] + + iw_begin_gated) * in_shape[3]; + + // fill in effective rows + for (index_t ih = ih_begin_gated; ih < ih_end_gated; ++ih) { + // fill in padding left + if (pad_left > 0) { + const index_t left_offset = im2col_offset - pad_left_size; + std::fill_n(im2col_data + left_offset, pad_left_size, zero_point); + } + // copy effective data + std::copy_n(in_data + in_offset, patch_row_size_gated, + im2col_data + im2col_offset); + // fill in padding right + if (pad_right > 0) { + const index_t right_offset = im2col_offset + patch_row_size_gated; + std::fill_n(im2col_data + 
right_offset, pad_right_size, + zero_point); + } + in_offset += input_row_size; + im2col_offset += patch_row_size; + } + + // fill in padding bottom + if (pad_bottom > 0) { + const index_t pad_bottom_size = pad_bottom * patch_row_size; + const index_t bottom_offset = + im2col_column_offset + depth - pad_bottom_size; + std::fill_n(im2col_data + bottom_offset, pad_bottom_size, + zero_point); + } + } + } + } + } + + private: + const ActivationType activation_; + const float relux_max_limit_; + + private: + MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; +#ifdef MACE_ENABLE_OPENCL template -struct Conv2dFunctor : Conv2dFunctorBase { - Conv2dFunctor(OpKernelContext *context, - const int *strides, - const Padding &padding_type, - const std::vector &paddings, - const int *dilations, - const ActivationType activation, - const float relux_max_limit); - - MaceStatus operator()(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - Tensor *output, - StatsFuture *future); +class Conv2dOp : public ConvPool2dOpBase { + public: + explicit Conv2dOp(OpConstructContext *context) + : ConvPool2dOpBase(context), + activation_(kernels::StringToActivationType( + Operation::GetOptionalArg("activation", + "NOOP"))), + relux_max_limit_(Operation::GetOptionalArg("max_limit", 0.0f)) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::Conv2dKernel); + } else { + kernel_.reset(new opencl::buffer::Conv2dKernel); + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(INPUT); + const Tensor *filter = this->Input(FILTER); + const Tensor *bias = this->InputSize() >= 3 ? 
this->Input(BIAS) : nullptr; + Tensor *output = this->Output(OUTPUT); + return kernel_->Compute(context, input, filter, bias, + strides_.data(), padding_type_, paddings_, + dilations_.data(), activation_, relux_max_limit_, + output); + } + private: + const ActivationType activation_; + const float relux_max_limit_; std::unique_ptr kernel_; + + private: + MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; #endif // MACE_ENABLE_OPENCL + +void RegisterConv2D(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp, + DeviceType::CPU, float); + + MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp, + DeviceType::CPU, uint8_t); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp, + DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "Conv2D", Conv2dOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_CONV_2D_H_ diff --git a/mace/ops/conv_pool_2d_base.h b/mace/kernels/conv_pool_2d_base.h similarity index 59% rename from mace/ops/conv_pool_2d_base.h rename to mace/kernels/conv_pool_2d_base.h index 0a8a8c17..d1e59c61 100644 --- a/mace/ops/conv_pool_2d_base.h +++ b/mace/kernels/conv_pool_2d_base.h @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_OPS_CONV_POOL_2D_BASE_H_ -#define MACE_OPS_CONV_POOL_2D_BASE_H_ +#ifndef MACE_KERNELS_CONV_POOL_2D_BASE_H_ +#define MACE_KERNELS_CONV_POOL_2D_BASE_H_ #include @@ -21,18 +21,17 @@ #include "mace/kernels/conv_pool_2d_util.h" namespace mace { -namespace ops { +namespace kernels { -template -class ConvPool2dOpBase : public Operator { +class ConvPool2dOpBase : public Operation { public: - ConvPool2dOpBase(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), - strides_(OperatorBase::GetRepeatedArgs("strides")), - padding_type_(static_cast(OperatorBase::GetOptionalArg( + explicit ConvPool2dOpBase(OpConstructContext *context) + : Operation(context), + strides_(Operation::GetRepeatedArgs("strides")), + padding_type_(static_cast(Operation::GetOptionalArg( "padding", static_cast(SAME)))), - paddings_(OperatorBase::GetRepeatedArgs("padding_values")), - dilations_(OperatorBase::GetRepeatedArgs("dilations", {1, 1})) {} + paddings_(Operation::GetRepeatedArgs("padding_values")), + dilations_(Operation::GetRepeatedArgs("dilations", {1, 1})) {} protected: std::vector strides_; @@ -41,7 +40,7 @@ class ConvPool2dOpBase : public Operator { std::vector dilations_; }; -} // namespace ops +} // namespace kernels } // namespace mace -#endif // MACE_OPS_CONV_POOL_2D_BASE_H_ +#endif // MACE_KERNELS_CONV_POOL_2D_BASE_H_ diff --git a/mace/kernels/conv_pool_2d_util.cc b/mace/kernels/conv_pool_2d_util.cc index ce9fb39c..c4669f4c 100644 --- a/mace/kernels/conv_pool_2d_util.cc +++ b/mace/kernels/conv_pool_2d_util.cc @@ -362,7 +362,7 @@ MaceStatus ConstructNCHWInputWithPadding(const Tensor *input_tensor, } } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } MaceStatus ConstructNCHWInputWithSpecificPadding(const Tensor *input_tensor, @@ -408,7 +408,7 @@ MaceStatus ConstructNCHWInputWithSpecificPadding(const Tensor *input_tensor, } } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } @@ -460,7 +460,7 @@ MaceStatus 
ConstructNHWCInputWithPadding(const Tensor *input_tensor, } } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace kernels diff --git a/mace/kernels/crop.h b/mace/kernels/crop.cc similarity index 63% rename from mace/kernels/crop.h rename to mace/kernels/crop.cc index 0838b69a..6b1ffa6a 100644 --- a/mace/kernels/crop.h +++ b/mace/kernels/crop.cc @@ -12,65 +12,30 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_CROP_H_ -#define MACE_KERNELS_CROP_H_ - #include -#include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/core/types.h" -#include "mace/kernels/kernel.h" -#include "mace/public/mace.h" +#include "mace/core/operator.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/crop.h" +#endif // MACE_ENABLE_OPENCL namespace mace { namespace kernels { -template -struct CropFunctor : OpKernel { - CropFunctor(OpKernelContext *context, - const int axis, - const std::vector &offset) - : OpKernel(context), - axis_(axis), - offset_(offset) {} - - void crop_copy(const T* input_data, T* output_data, - const std::vector &input_shape, - const std::vector &output_shape, - const int32_t* offsets) { - const index_t out_img_size = - output_shape[1] * output_shape[2] * output_shape[3]; - const index_t out_hw = output_shape[2] * output_shape[3]; - const index_t in_img_size = - input_shape[1] * input_shape[2] * input_shape[3]; - const index_t in_hw = input_shape[2] * input_shape[3]; -#pragma omp parallel for collapse(3) - for (int b = 0; b < output_shape[0]; ++b) { - for (int c = 0; c < output_shape[1]; ++c) { - for (int h = 0; h < output_shape[2]; ++h) { - T* out_ptr = - output_data + b * out_img_size + c * out_hw + h * output_shape[3]; - const T* in_ptr_bch = - input_data + (b + offsets[0]) * in_img_size + - (c + offsets[1]) * in_hw + - (h + offsets[2]) * input_shape[3] + offsets[3]; - memcpy(out_ptr, in_ptr_bch, - output_shape[3] * 
sizeof(T)); - } - } - } - } - - MaceStatus operator()(const std::vector &input_list, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); - - MACE_CHECK(input_list.size() == 2, "Crop op needs two inputs."); - const Tensor *input0 = input_list[0]; - const Tensor *input1 = input_list[1]; +template +class CropOp : public Operation { + public: + explicit CropOp(OpConstructContext *context) + : Operation(context), + axis_(Operation::GetOptionalArg("axis", 2)), + offset_(Operation::GetRepeatedArgs("offset")) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + MACE_CHECK(inputs_.size() == 2, "Crop op needs two inputs."); + Tensor *output = this->Output(0); + const Tensor *input0 = inputs_[0]; + const Tensor *input1 = inputs_[1]; const uint32_t in0_dims = static_cast(input0->dim_size()); const uint32_t in1_dims = static_cast(input0->dim_size()); @@ -91,8 +56,8 @@ struct CropFunctor : OpKernel { crop_offset = offset_[i - axis_]; } MACE_CHECK(input0->dim(i) - crop_offset >= input1->dim(i)) - << "the crop for dimension" << i << "is out of bound with size" - << input1->dim(i) << "and offset" << crop_offset; + << "the crop for dimension" << i << "is out of bound with size" + << input1->dim(i) << "and offset" << crop_offset; } output_shape[i] = new_size; offsets[i] = crop_offset; @@ -105,37 +70,78 @@ struct CropFunctor : OpKernel { crop_copy(input_data, output_data, input0->shape(), output_shape, offsets.data()); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; + } + + private: + void crop_copy(const T* input_data, T* output_data, + const std::vector &input_shape, + const std::vector &output_shape, + const int32_t* offsets) { + const index_t out_img_size = + output_shape[1] * output_shape[2] * output_shape[3]; + const index_t out_hw = output_shape[2] * output_shape[3]; + const index_t in_img_size = + input_shape[1] * input_shape[2] * input_shape[3]; + const index_t in_hw = input_shape[2] * input_shape[3]; +#pragma omp parallel for 
collapse(3) + for (int b = 0; b < output_shape[0]; ++b) { + for (int c = 0; c < output_shape[1]; ++c) { + for (int h = 0; h < output_shape[2]; ++h) { + T* out_ptr = + output_data + b * out_img_size + c * out_hw + h * output_shape[3]; + const T* in_ptr_bch = + input_data + (b + offsets[0]) * in_img_size + + (c + offsets[1]) * in_hw + + (h + offsets[2]) * input_shape[3] + offsets[3]; + memcpy(out_ptr, in_ptr_bch, + output_shape[3] * sizeof(T)); + } + } + } } + private: const int axis_; std::vector offset_; }; #ifdef MACE_ENABLE_OPENCL -class OpenCLCropKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const std::vector &input_list, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLCropKernel); -}; template -struct CropFunctor : OpKernel { - CropFunctor(OpKernelContext *context, - const int axis, - const std::vector &offset); - - MaceStatus operator()(const std::vector &input_list, - Tensor *output, - StatsFuture *future); +class CropOp : public Operation { + public: + explicit CropOp(OpConstructContext *context) + : Operation(context) { + const int axis = Operation::GetOptionalArg("axis", 2); + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::CropKernel( + axis, Operation::GetRepeatedArgs("offset"))); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + return kernel_->Compute(context, inputs_, this->Output(0)); + } + + private: std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL + +void RegisterCrop(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Crop", CropOp, + DeviceType::CPU, float); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "Crop", CropOp, + DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "Crop", CropOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_CROP_H_ diff --git 
a/mace/kernels/deconv_2d.cc b/mace/kernels/deconv_2d.cc new file mode 100644 index 00000000..44c0c119 --- /dev/null +++ b/mace/kernels/deconv_2d.cc @@ -0,0 +1,561 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/kernels/deconv_2d.h" + +#if defined(MACE_ENABLE_NEON) +#include +#endif + +#include +#include +#include +#include +#include + +#include "mace/core/future.h" +#include "mace/core/operator.h" +#include "mace/core/tensor.h" +#include "mace/kernels/activation.h" +#include "mace/kernels/arm/deconv_2d_neon.h" +#include "mace/kernels/conv_pool_2d_util.h" +#include "mace/utils/utils.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/deconv_2d.h" +#endif // MACE_ENABLE_OPENCL + +namespace mace { +namespace kernels { + +class Deconv2dOpBase : public Operation { + public: + explicit Deconv2dOpBase(OpConstructContext *context) + : Operation(context), + strides_(Operation::GetRepeatedArgs("strides")), + padding_type_(static_cast(Operation::GetOptionalArg( + "padding", static_cast(SAME)))), + paddings_(Operation::GetRepeatedArgs("padding_values")), + model_type_(static_cast( + Operation::GetOptionalArg("framework_type", 0))), + activation_(kernels::StringToActivationType( + Operation::GetOptionalArg("activation", + "NOOP"))), + relux_max_limit_(Operation::GetOptionalArg("max_limit", 0.0f)) {} + + + static void CalcDeconvOutputSize( + const index_t *input_shape, // NHWC + 
const index_t *filter_shape, // OIHW + const int *strides, + index_t *output_shape, + const int *padding_size, + int *input_padding, + const bool isNCHW = false) { + MACE_CHECK_NOTNULL(output_shape); + MACE_CHECK_NOTNULL(padding_size); + MACE_CHECK_NOTNULL(input_shape); + MACE_CHECK_NOTNULL(filter_shape); + MACE_CHECK_NOTNULL(strides); + + const index_t output_channel = filter_shape[0]; + + const index_t in_height = isNCHW ? input_shape[2] : input_shape[1]; + const index_t in_width = isNCHW ? input_shape[3] : input_shape[2]; + + const index_t kernel_h = filter_shape[2]; + const index_t kernel_w = filter_shape[3]; + + input_padding[0] = static_cast((kernel_h -1) * 2 - padding_size[0]); + input_padding[1] = static_cast((kernel_w -1) * 2 - padding_size[1]); + input_padding[0] = std::max(0, input_padding[0]); + input_padding[1] = std::max(0, input_padding[1]); + + index_t out_height = + (in_height - 1) * strides[0] + kernel_h - padding_size[0]; + index_t out_width = + (in_width - 1) * strides[1] + kernel_w - padding_size[1]; + + output_shape[0] = input_shape[0]; + if (isNCHW) { + output_shape[1] = output_channel; + output_shape[2] = out_height; + output_shape[3] = out_width; + } else { + output_shape[1] = out_height; + output_shape[2] = out_width; + output_shape[3] = output_channel; + } + } + + static void CalcDeconvPaddingAndInputSize( + const index_t *input_shape, // NHWC + const index_t *filter_shape, // OIHW + const int *strides, + Padding padding, + const index_t *output_shape, + int *padding_size, + const bool isNCHW = false) { + MACE_CHECK_NOTNULL(output_shape); + MACE_CHECK_NOTNULL(padding_size); + MACE_CHECK_NOTNULL(input_shape); + MACE_CHECK_NOTNULL(filter_shape); + MACE_CHECK_NOTNULL(strides); + + const index_t in_height = isNCHW ? input_shape[2] : input_shape[1]; + const index_t in_width = isNCHW ? input_shape[3] : input_shape[2]; + + const index_t out_height = isNCHW ? output_shape[2] : output_shape[1]; + const index_t out_width = isNCHW ? 
output_shape[3] : output_shape[2]; + + const index_t extended_input_height = (in_height - 1) * strides[0] + 1; + const index_t extended_input_width = (in_width - 1) * strides[1] + 1; + + const index_t filter_h = filter_shape[2]; + const index_t filter_w = filter_shape[3]; + + index_t expected_input_height = 0, expected_input_width = 0; + + switch (padding) { + case VALID: + expected_input_height = + (out_height - filter_h + strides[0]) / strides[0]; + expected_input_width = + (out_width - filter_w + strides[1]) / strides[1]; + break; + case SAME: + expected_input_height = + (out_height + strides[0] - 1) / strides[0]; + expected_input_width = + (out_width + strides[1] - 1) / strides[1]; + break; + default: + MACE_CHECK(false, "Unsupported padding type: ", padding); + } + + MACE_CHECK(expected_input_height == in_height, + expected_input_height, "!=", in_height); + MACE_CHECK(expected_input_width == in_width, + expected_input_width, "!=", in_width); + + const int p_h = static_cast(out_height + + filter_h - 1 - extended_input_height); + const int p_w = static_cast(out_width + + filter_w - 1 - extended_input_width); + + padding_size[0] = std::max(0, p_h); + padding_size[1] = std::max(0, p_w); + } + + protected: + std::vector strides_; // [stride_h, stride_w] + const Padding padding_type_; + std::vector paddings_; + const FrameworkType model_type_; + const ActivationType activation_; + const float relux_max_limit_; +}; + +template +class Deconv2dOp; + +template <> +class Deconv2dOp : public Deconv2dOpBase { + public: + explicit Deconv2dOp(OpConstructContext *context) + : Deconv2dOpBase(context) {} + + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(0); + const Tensor *filter = this->Input(1); + const Tensor *bias = nullptr; + const Tensor *output_shape_tensor = nullptr; + if (model_type_ == kernels::CAFFE) { + bias = this->InputSize() >= 3 ? this->Input(2) : nullptr; + } else { + output_shape_tensor = + this->InputSize() >= 3 ? 
this->Input(2) : nullptr; + bias = this->InputSize() >= 4 ? this->Input(3) : nullptr; + } + Tensor *output = this->Output(0); + + MACE_CHECK_NOTNULL(input); + MACE_CHECK_NOTNULL(filter); + MACE_CHECK_NOTNULL(output); + + std::vector paddings(2); + std::vector out_paddings(2); + std::vector output_shape(4); + if (model_type_ == FrameworkType::TENSORFLOW) { // tensorflow + paddings = std::vector(2, 0); + MACE_CHECK_NOTNULL(output_shape_tensor); + MACE_CHECK(output_shape_tensor->size() == 4); + Tensor::MappingGuard output_shape_mapper(output_shape_tensor); + auto output_shape_data = + output_shape_tensor->data(); + output_shape = + std::vector(output_shape_data, output_shape_data + 4); + + const index_t t = output_shape[1]; + output_shape[1] = output_shape[3]; + output_shape[3] = output_shape[2]; + output_shape[2] = t; + + CalcDeconvPaddingAndInputSize( + input->shape().data(), + filter->shape().data(), + strides_.data(), padding_type_, + output_shape.data(), + paddings.data(), true); + } else { // caffe + out_paddings = paddings_; + output_shape = std::vector(4, 0); + CalcDeconvOutputSize(input->shape().data(), + filter->shape().data(), + strides_.data(), + output_shape.data(), + out_paddings.data(), + paddings.data(), + true); + } + MACE_RETURN_IF_ERROR(output->Resize(output_shape)); + index_t kernel_h = filter->dim(2); + index_t kernel_w = filter->dim(3); + const index_t *in_shape = input->shape().data(); + + MACE_CHECK(filter->dim(0) == output_shape[1], filter->dim(0), " != ", + output_shape[1]); + MACE_CHECK(filter->dim(1) == in_shape[1], filter->dim(1), " != ", + in_shape[1]); + MACE_CHECK(in_shape[0] == output_shape[0], + "Input/Output batch size mismatch"); + std::function deconv_func; + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard filter_mapper(filter); + Tensor::MappingGuard bias_mapper(bias); + Tensor::MappingGuard output_mapper(output); + auto input_data = input->data(); + auto filter_data = filter->data(); + auto bias_data = bias == 
nullptr ? nullptr : bias->data(); + auto output_data = output->mutable_data(); + + const index_t padded_out_h = (in_shape[2] - 1) * strides_[0] + kernel_h; + const index_t padded_out_w = (in_shape[3] - 1) * strides_[1] + kernel_w; + const index_t pad_h = (padded_out_h - output_shape[2]) / 2; + const index_t pad_w = (padded_out_w - output_shape[3]) / 2; + + std::vector padded_out_shape({output_shape[0], output_shape[1], + padded_out_h, padded_out_w}); + index_t padded_out_size = + std::accumulate(padded_out_shape.begin(), + padded_out_shape.end(), + 1, + std::multiplies()) * sizeof(float); + ScratchBuffer *scratch = context->device()->scratch_buffer(); + scratch->Rewind(); + scratch->GrowSize(padded_out_size); + Tensor padded_out(scratch->Scratch(padded_out_size), DT_FLOAT); + padded_out.Reshape(padded_out_shape); + padded_out.Clear(); + auto *padded_out_data = padded_out.mutable_data(); + + bool use_neon_3x3_s1 = kernel_h == kernel_w && kernel_h == 3 && + strides_[0] == strides_[1] && strides_[0] == 1; + bool use_neon_3x3_s2 = kernel_h == kernel_w && kernel_h == 3 && + strides_[0] == strides_[1] && strides_[0] == 2; + + bool use_neon_4x4_s1 = kernel_h == kernel_w && kernel_h == 4 && + strides_[0] == strides_[1] && strides_[0] == 1; + bool use_neon_4x4_s2 = kernel_h == kernel_w && kernel_h == 4 && + strides_[0] == strides_[1] && strides_[0] == 2; + + if (use_neon_3x3_s1) { + deconv_func = [=](const float *input, + const float *filter, + const index_t *in_shape, + const index_t *padded_out_shape, + float *padded_output) { + Deconv2dNeonK3x3S1(input, + filter, + in_shape, + padded_out_shape, + padded_output); + }; + } else if (use_neon_3x3_s2) { + deconv_func = [=](const float *input, + const float *filter, + const index_t *in_shape, + const index_t *padded_out_shape, + float *padded_output) { + Deconv2dNeonK3x3S2(input, + filter, + in_shape, + padded_out_shape, + padded_output); + }; + } else if (use_neon_4x4_s1) { + deconv_func = [=](const float *input, + const 
float *filter, + const index_t *in_shape, + const index_t *padded_out_shape, + float *padded_output) { + Deconv2dNeonK4x4S1(input, + filter, + in_shape, + padded_out_shape, + padded_output); + }; + } else if (use_neon_4x4_s2) { + deconv_func = [=](const float *input, + const float *filter, + const index_t *in_shape, + const index_t *padded_out_shape, + float *padded_output) { + Deconv2dNeonK4x4S2(input, + filter, + in_shape, + padded_out_shape, + padded_output); + }; + } else { + deconv_func = [=](const float *input, + const float *filter, + const index_t *in_shape, + const index_t *padded_out_shape, + float *padded_output) { + Deconv2dGeneral(input, + filter, + kernel_h, + kernel_w, + strides_.data(), + in_shape, + padded_out_shape, + padded_output); + }; + } + + bool no_pad = + padded_out_h == output_shape[2] && padded_out_w == output_shape[3]; + float *out_data = no_pad ? output_data : padded_out_data; + + deconv_func(input_data, + filter_data, + in_shape, + padded_out_shape.data(), + out_data); + if (!no_pad) { + CropPadOut(out_data, + padded_out_shape.data(), + output_shape.data(), + pad_h, + pad_w, + output_data); + } + + if (bias_data != nullptr) { + const index_t batch = output_shape[0]; + const index_t channels = output_shape[1]; + const index_t img_size = output_shape[2] * output_shape[3]; +#pragma omp parallel for collapse(3) + for (index_t b = 0; b < batch; ++b) { + for (index_t c = 0; c < channels; ++c) { + for (index_t i = 0; i < img_size; ++i) { + output_data[(b * channels + c) * img_size + i] += + bias_data[c]; + } + } + } + } + + DoActivation(output_data, + output_data, + output->size(), + activation_, + relux_max_limit_); + + return MaceStatus::MACE_SUCCESS; + } + + private: + void Deconv2dGeneral(const float *input, + const float *filter, + const index_t kernel_h, + const index_t kernel_w, + const int *strides, + const index_t *in_shape, + const index_t *out_shape, + float *output) { + const index_t out_height = out_shape[2]; + const index_t 
out_width = out_shape[3]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_img_size = out_height * out_width; + const index_t in_img_size = in_height * in_width; + + const int kernel_size = static_cast(kernel_h * kernel_w); + std::vector index_map(kernel_size, 0); + for (index_t i = 0; i < kernel_h; ++i) { + for (index_t j = 0; j < kernel_w; ++j) { + index_map[i * kernel_w + j] = i * out_width + j; + } + } + + const index_t batch = in_shape[0]; + const index_t out_channels = out_shape[1]; + const index_t in_channels = in_shape[1]; + +#pragma omp parallel for collapse(2) + for (int b = 0; b < batch; ++b) { + for (int oc = 0; oc < out_channels; ++oc) { + float *out_base = + output + (b * out_channels + oc) * out_img_size; + for (int i = 0; i < in_height; ++i) { + for (int j = 0; j < in_width; ++j) { + const index_t out_offset = + i * strides[0] * out_width + j * strides[1]; + for (int ic = 0; ic < in_channels; ++ic) { + const index_t input_idx = + (b * in_channels + ic) * in_img_size + i * in_width + j; + const float val = input[input_idx]; + const index_t kernel_offset = + (oc * in_channels + ic) * kernel_size; + for (int k = 0; k < kernel_size; ++k) { + const index_t out_idx = out_offset + index_map[k]; + const index_t kernel_idx = kernel_offset + k; + out_base[out_idx] += val * filter[kernel_idx]; + } + } + } + } + } + } + } + + void CropPadOut(const float *input, + const index_t *in_shape, + const index_t *out_shape, + const index_t pad_h, + const index_t pad_w, + float *output) { + const index_t batch = in_shape[0]; + const index_t channel = in_shape[1]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; +#pragma omp parallel for collapse(3) + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channel; ++j) { + for (int k = 0; k < out_height; ++k) { + const float *input_base = + input + ((i 
* channel + j) * in_height + (k + pad_h)) * in_width; + float *output_base = + output + ((i * channel + j) * out_height + k)* out_width; + memcpy(output_base, input_base + pad_w, out_width * sizeof(float)); + } + } + } + } +}; + +#ifdef MACE_ENABLE_OPENCL +template +class Deconv2dOp : public Deconv2dOpBase { + public: + explicit Deconv2dOp(OpConstructContext *context) + : Deconv2dOpBase(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::Deconv2dKernel); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(0); + const Tensor *filter = this->Input(1); + const Tensor *bias = nullptr; + const Tensor *output_shape_tensor = nullptr; + if (model_type_ == kernels::CAFFE) { + bias = this->InputSize() >= 3 ? this->Input(2) : nullptr; + } else { + output_shape_tensor = + this->InputSize() >= 3 ? this->Input(2) : nullptr; + bias = this->InputSize() >= 4 ? this->Input(3) : nullptr; + } + Tensor *output = this->Output(0); + + MACE_CHECK_NOTNULL(input); + MACE_CHECK_NOTNULL(filter); + MACE_CHECK_NOTNULL(output); + std::vector paddings(2); + std::vector out_paddings(2); + std::vector output_shape(4); + if (model_type_ == FrameworkType::TENSORFLOW) { + paddings = std::vector(2, 0); + MACE_CHECK_NOTNULL(output_shape_tensor); + MACE_CHECK(output_shape_tensor->size() == 4); + Tensor::MappingGuard output_shape_mapper(output_shape_tensor); + auto output_shape_data = + output_shape_tensor->data(); + output_shape = + std::vector(output_shape_data, output_shape_data + 4); + CalcDeconvPaddingAndInputSize(input->shape().data(), + filter->shape().data(), + strides_.data(), + padding_type_, + output_shape.data(), + paddings.data()); + } else { + out_paddings = paddings_; + paddings = std::vector(2, 0); + output_shape = std::vector(4, 0); + CalcDeconvOutputSize(input->shape().data(), + filter->shape().data(), + strides_.data(), + output_shape.data(), + 
out_paddings.data(), + paddings.data()); + } + + return kernel_->Compute(context, input, filter, bias, + strides_.data(), paddings.data(), activation_, + relux_max_limit_, output_shape, output); + } + + private: + std::unique_ptr kernel_; +}; +#endif // MACE_ENABLE_OPENCL + + +void RegisterDeconv2D(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp, + DeviceType::CPU, float); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp, + DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "Deconv2D", Deconv2dOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/deconv_2d.h b/mace/kernels/deconv_2d.h index 7e1ed460..25413d98 100644 --- a/mace/kernels/deconv_2d.h +++ b/mace/kernels/deconv_2d.h @@ -15,22 +15,6 @@ #ifndef MACE_KERNELS_DECONV_2D_H_ #define MACE_KERNELS_DECONV_2D_H_ -#if defined(MACE_ENABLE_NEON) -#include -#endif - -#include -#include -#include -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/activation.h" -#include "mace/kernels/arm/deconv_2d_neon.h" -#include "mace/kernels/conv_pool_2d_util.h" -#include "mace/utils/utils.h" - namespace mace { namespace kernels { @@ -39,483 +23,6 @@ enum FrameworkType { CAFFE = 1, }; -struct Deconv2dFunctorBase : OpKernel { - Deconv2dFunctorBase(OpKernelContext *context, - const std::vector &strides, - const Padding &padding_type, - const std::vector &paddings, - const FrameworkType model_type, - const ActivationType activation, - const float relux_max_limit) - : OpKernel(context), - strides_(strides), - padding_type_(padding_type), - paddings_(paddings), - model_type_(model_type), - activation_(activation), - relux_max_limit_(relux_max_limit) {} - - static void CalcDeconvOutputSize( - const index_t *input_shape, // NHWC - const index_t *filter_shape, // OIHW - const int *strides, - index_t *output_shape, - const int 
*padding_size, - int *input_padding, - const bool isNCHW = false) { - MACE_CHECK_NOTNULL(output_shape); - MACE_CHECK_NOTNULL(padding_size); - MACE_CHECK_NOTNULL(input_shape); - MACE_CHECK_NOTNULL(filter_shape); - MACE_CHECK_NOTNULL(strides); - - const index_t output_channel = filter_shape[0]; - - const index_t in_height = isNCHW ? input_shape[2] : input_shape[1]; - const index_t in_width = isNCHW ? input_shape[3] : input_shape[2]; - - const index_t kernel_h = filter_shape[2]; - const index_t kernel_w = filter_shape[3]; - - input_padding[0] = static_cast((kernel_h -1) * 2 - padding_size[0]); - input_padding[1] = static_cast((kernel_w -1) * 2 - padding_size[1]); - input_padding[0] = std::max(0, input_padding[0]); - input_padding[1] = std::max(0, input_padding[1]); - - index_t out_height = - (in_height - 1) * strides[0] + kernel_h - padding_size[0]; - index_t out_width = - (in_width - 1) * strides[1] + kernel_w - padding_size[1]; - - output_shape[0] = input_shape[0]; - if (isNCHW) { - output_shape[1] = output_channel; - output_shape[2] = out_height; - output_shape[3] = out_width; - } else { - output_shape[1] = out_height; - output_shape[2] = out_width; - output_shape[3] = output_channel; - } - } - - static void CalcDeconvPaddingAndInputSize( - const index_t *input_shape, // NHWC - const index_t *filter_shape, // OIHW - const int *strides, - Padding padding, - const index_t *output_shape, - int *padding_size, - const bool isNCHW = false) { - MACE_CHECK_NOTNULL(output_shape); - MACE_CHECK_NOTNULL(padding_size); - MACE_CHECK_NOTNULL(input_shape); - MACE_CHECK_NOTNULL(filter_shape); - MACE_CHECK_NOTNULL(strides); - - const index_t in_height = isNCHW ? input_shape[2] : input_shape[1]; - const index_t in_width = isNCHW ? input_shape[3] : input_shape[2]; - - const index_t out_height = isNCHW ? output_shape[2] : output_shape[1]; - const index_t out_width = isNCHW ? 
output_shape[3] : output_shape[2]; - - const index_t extended_input_height = (in_height - 1) * strides[0] + 1; - const index_t extended_input_width = (in_width - 1) * strides[1] + 1; - - const index_t filter_h = filter_shape[2]; - const index_t filter_w = filter_shape[3]; - - index_t expected_input_height = 0, expected_input_width = 0; - - switch (padding) { - case VALID: - expected_input_height = - (out_height - filter_h + strides[0]) / strides[0]; - expected_input_width = - (out_width - filter_w + strides[1]) / strides[1]; - break; - case SAME: - expected_input_height = - (out_height + strides[0] - 1) / strides[0]; - expected_input_width = - (out_width + strides[1] - 1) / strides[1]; - break; - default: - MACE_CHECK(false, "Unsupported padding type: ", padding); - } - - MACE_CHECK(expected_input_height == in_height, - expected_input_height, "!=", in_height); - MACE_CHECK(expected_input_width == in_width, - expected_input_width, "!=", in_width); - - const int p_h = static_cast(out_height + - filter_h - 1 - extended_input_height); - const int p_w = static_cast(out_width + - filter_w - 1 - extended_input_width); - - padding_size[0] = std::max(0, p_h); - padding_size[1] = std::max(0, p_w); - } - - std::vector strides_; // [stride_h, stride_w] - const Padding padding_type_; - std::vector paddings_; - const FrameworkType model_type_; - const ActivationType activation_; - const float relux_max_limit_; -}; - - -template -struct Deconv2dFunctor; - -template<> -struct Deconv2dFunctor: Deconv2dFunctorBase { - Deconv2dFunctor(OpKernelContext *context, - const std::vector &strides, - const Padding &padding_type, - const std::vector &paddings, - const FrameworkType model_type, - const ActivationType activation, - const float relux_max_limit) - : Deconv2dFunctorBase(context, - strides, - padding_type, - paddings, - model_type, - activation, - relux_max_limit) {} - - void Deconv2dGeneral(const float *input, - const float *filter, - const index_t kernel_h, - const index_t 
kernel_w, - const int *strides, - const index_t *in_shape, - const index_t *out_shape, - float *output) { - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_img_size = out_height * out_width; - const index_t in_img_size = in_height * in_width; - - const int kernel_size = static_cast(kernel_h * kernel_w); - std::vector index_map(kernel_size, 0); - for (index_t i = 0; i < kernel_h; ++i) { - for (index_t j = 0; j < kernel_w; ++j) { - index_map[i * kernel_w + j] = i * out_width + j; - } - } - - const index_t batch = in_shape[0]; - const index_t out_channels = out_shape[1]; - const index_t in_channels = in_shape[1]; - -#pragma omp parallel for collapse(2) - for (int b = 0; b < batch; ++b) { - for (int oc = 0; oc < out_channels; ++oc) { - float *out_base = - output + (b * out_channels + oc) * out_img_size; - for (int i = 0; i < in_height; ++i) { - for (int j = 0; j < in_width; ++j) { - const index_t out_offset = - i * strides[0] * out_width + j * strides[1]; - for (int ic = 0; ic < in_channels; ++ic) { - const index_t input_idx = - (b * in_channels + ic) * in_img_size + i * in_width + j; - const float val = input[input_idx]; - const index_t kernel_offset = - (oc * in_channels + ic) * kernel_size; - for (int k = 0; k < kernel_size; ++k) { - const index_t out_idx = out_offset + index_map[k]; - const index_t kernel_idx = kernel_offset + k; - out_base[out_idx] += val * filter[kernel_idx]; - } - } - } - } - } - } - } - - void CropPadOut(const float *input, - const index_t *in_shape, - const index_t *out_shape, - const index_t pad_h, - const index_t pad_w, - float *output) { - const index_t batch = in_shape[0]; - const index_t channel = in_shape[1]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; -#pragma omp parallel for 
collapse(3) - for (int i = 0; i < batch; ++i) { - for (int j = 0; j < channel; ++j) { - for (int k = 0; k < out_height; ++k) { - const float *input_base = - input + ((i * channel + j) * in_height + (k + pad_h)) * in_width; - float *output_base = - output + ((i * channel + j) * out_height + k)* out_width; - memcpy(output_base, input_base + pad_w, out_width * sizeof(float)); - } - } - } - } - - MaceStatus operator()(const Tensor *input, // NCHW - const Tensor *filter, // OIHW - const Tensor *bias, - const Tensor *output_shape_tensor, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); - MACE_CHECK_NOTNULL(input); - MACE_CHECK_NOTNULL(filter); - MACE_CHECK_NOTNULL(output); - - std::vector paddings(2); - std::vector out_paddings(2); - std::vector output_shape(4); - if (model_type_ == FrameworkType::TENSORFLOW) { // tensorflow - paddings = std::vector(2, 0); - MACE_CHECK_NOTNULL(output_shape_tensor); - MACE_CHECK(output_shape_tensor->size() == 4); - Tensor::MappingGuard output_shape_mapper(output_shape_tensor); - auto output_shape_data = - output_shape_tensor->data(); - output_shape = - std::vector(output_shape_data, output_shape_data + 4); - - const index_t t = output_shape[1]; - output_shape[1] = output_shape[3]; - output_shape[3] = output_shape[2]; - output_shape[2] = t; - - CalcDeconvPaddingAndInputSize( - input->shape().data(), - filter->shape().data(), - strides_.data(), padding_type_, - output_shape.data(), - paddings.data(), true); - } else { // caffe - out_paddings = paddings_; - output_shape = std::vector(4, 0); - CalcDeconvOutputSize(input->shape().data(), - filter->shape().data(), - strides_.data(), - output_shape.data(), - out_paddings.data(), - paddings.data(), - true); - } - MACE_RETURN_IF_ERROR(output->Resize(output_shape)); - index_t kernel_h = filter->dim(2); - index_t kernel_w = filter->dim(3); - const index_t *in_shape = input->shape().data(); - - MACE_CHECK(filter->dim(0) == output_shape[1], filter->dim(0), " != ", - output_shape[1]); 
- MACE_CHECK(filter->dim(1) == in_shape[1], filter->dim(1), " != ", - in_shape[1]); - MACE_CHECK(in_shape[0] == output_shape[0], - "Input/Output batch size mismatch"); - std::function deconv_func; - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard filter_mapper(filter); - Tensor::MappingGuard bias_mapper(bias); - Tensor::MappingGuard output_mapper(output); - auto input_data = input->data(); - auto filter_data = filter->data(); - auto bias_data = bias == nullptr ? nullptr : bias->data(); - auto output_data = output->mutable_data(); - - const index_t padded_out_h = (in_shape[2] - 1) * strides_[0] + kernel_h; - const index_t padded_out_w = (in_shape[3] - 1) * strides_[1] + kernel_w; - const index_t pad_h = (padded_out_h - output_shape[2]) / 2; - const index_t pad_w = (padded_out_w - output_shape[3]) / 2; - - std::vector padded_out_shape({output_shape[0], output_shape[1], - padded_out_h, padded_out_w}); - index_t padded_out_size = - std::accumulate(padded_out_shape.begin(), - padded_out_shape.end(), - 1, - std::multiplies()) * sizeof(float); - ScratchBuffer *scratch = context_->device()->scratch_buffer(); - scratch->Rewind(); - scratch->GrowSize(padded_out_size); - Tensor padded_out(scratch->Scratch(padded_out_size), DT_FLOAT); - padded_out.Reshape(padded_out_shape); - padded_out.Clear(); - auto *padded_out_data = padded_out.mutable_data(); - - bool use_neon_3x3_s1 = kernel_h == kernel_w && kernel_h == 3 && - strides_[0] == strides_[1] && strides_[0] == 1; - bool use_neon_3x3_s2 = kernel_h == kernel_w && kernel_h == 3 && - strides_[0] == strides_[1] && strides_[0] == 2; - - bool use_neon_4x4_s1 = kernel_h == kernel_w && kernel_h == 4 && - strides_[0] == strides_[1] && strides_[0] == 1; - bool use_neon_4x4_s2 = kernel_h == kernel_w && kernel_h == 4 && - strides_[0] == strides_[1] && strides_[0] == 2; - - if (use_neon_3x3_s1) { - deconv_func = [=](const float *input, - const float *filter, - const index_t *in_shape, - const index_t *padded_out_shape, - 
float *padded_output) { - Deconv2dNeonK3x3S1(input, - filter, - in_shape, - padded_out_shape, - padded_output); - }; - } else if (use_neon_3x3_s2) { - deconv_func = [=](const float *input, - const float *filter, - const index_t *in_shape, - const index_t *padded_out_shape, - float *padded_output) { - Deconv2dNeonK3x3S2(input, - filter, - in_shape, - padded_out_shape, - padded_output); - }; - } else if (use_neon_4x4_s1) { - deconv_func = [=](const float *input, - const float *filter, - const index_t *in_shape, - const index_t *padded_out_shape, - float *padded_output) { - Deconv2dNeonK4x4S1(input, - filter, - in_shape, - padded_out_shape, - padded_output); - }; - } else if (use_neon_4x4_s2) { - deconv_func = [=](const float *input, - const float *filter, - const index_t *in_shape, - const index_t *padded_out_shape, - float *padded_output) { - Deconv2dNeonK4x4S2(input, - filter, - in_shape, - padded_out_shape, - padded_output); - }; - } else { - deconv_func = [=](const float *input, - const float *filter, - const index_t *in_shape, - const index_t *padded_out_shape, - float *padded_output) { - Deconv2dGeneral(input, - filter, - kernel_h, - kernel_w, - strides_.data(), - in_shape, - padded_out_shape, - padded_output); - }; - } - - bool no_pad = - padded_out_h == output_shape[2] && padded_out_w == output_shape[3]; - float *out_data = no_pad ? 
output_data : padded_out_data; - - deconv_func(input_data, - filter_data, - in_shape, - padded_out_shape.data(), - out_data); - if (!no_pad) { - CropPadOut(out_data, - padded_out_shape.data(), - output_shape.data(), - pad_h, - pad_w, - output_data); - } - - if (bias_data != nullptr) { - const index_t batch = output_shape[0]; - const index_t channels = output_shape[1]; - const index_t img_size = output_shape[2] * output_shape[3]; -#pragma omp parallel for collapse(3) - for (index_t b = 0; b < batch; ++b) { - for (index_t c = 0; c < channels; ++c) { - for (index_t i = 0; i < img_size; ++i) { - output_data[(b * channels + c) * img_size + i] += - bias_data[c]; - } - } - } - } - - DoActivation(output_data, - output_data, - output->size(), - activation_, - relux_max_limit_); - - return MACE_SUCCESS; - } -}; - -#ifdef MACE_ENABLE_OPENCL -class OpenCLDeconv2dKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int *strides, - const int *padding_data, - const ActivationType activation, - const float relux_max_limit, - const std::vector &output_shape, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLDeconv2dKernel); -}; -template -struct Deconv2dFunctor : Deconv2dFunctorBase { - Deconv2dFunctor(OpKernelContext *context, - const std::vector &strides, - const Padding &padding_type, - const std::vector &paddings, - const FrameworkType model_type, - const ActivationType activation, - const float relux_max_limit); - - MaceStatus operator()(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const Tensor *output_shape_tensor, - Tensor *output, - StatsFuture *future); - - std::unique_ptr kernel_; -}; -#endif // MACE_ENABLE_OPENCL - } // namespace kernels } // namespace mace diff --git a/mace/kernels/depth_to_space.h b/mace/kernels/depth_to_space.cc similarity index 62% rename from mace/kernels/depth_to_space.h rename to 
mace/kernels/depth_to_space.cc index e73dec76..cd10b2b0 100644 --- a/mace/kernels/depth_to_space.h +++ b/mace/kernels/depth_to_space.cc @@ -12,32 +12,29 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_DEPTH_TO_SPACE_H_ -#define MACE_KERNELS_DEPTH_TO_SPACE_H_ #include #include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" -#include "mace/public/mace.h" - +#include "mace/core/operator.h" #ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" +#include "mace/kernels/opencl/image/depth_to_space.h" #endif // MACE_ENABLE_OPENCL namespace mace { namespace kernels { -template -struct DepthToSpaceOpFunctor : OpKernel { - DepthToSpaceOpFunctor(OpKernelContext *context, - const int block_size) - : OpKernel(context), block_size_(block_size) {} - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); +template +class DepthToSpaceOp : public Operation { + public: + explicit DepthToSpaceOp(OpConstructContext *context) + : Operation(context), + block_size_(Operation::GetOptionalArg("block_size", 1)) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + MACE_CHECK(input->dim_size() == 4, "input dim should be 4"); const index_t batch_size = input->dim(0); const index_t input_depth = input->dim(1); const index_t input_height = input->dim(2); @@ -85,36 +82,50 @@ struct DepthToSpaceOpFunctor : OpKernel { } } - - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } + private: const int block_size_; }; #ifdef MACE_ENABLE_OPENCL -class OpenCLDepthToSpaceKernel { +template +class DepthToSpaceOp : public Operation { public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - Tensor *output, - StatsFuture *future) = 0; - 
MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLDepthToSpaceKernel); -}; -template -struct DepthToSpaceOpFunctor : OpKernel { - DepthToSpaceOpFunctor(OpKernelContext *context, - const int block_size); - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future); + explicit DepthToSpaceOp(OpConstructContext *context) + : Operation(context) { + int block_size = Operation::GetOptionalArg("block_size", 1); + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::DepthToSpaceKernel(block_size)); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + MACE_CHECK(input->dim_size() == 4, "input dim should be 4"); + return kernel_->Compute(context, input, output); + } + private: std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL +void RegisterDepthToSpace(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "DepthToSpace", + DepthToSpaceOp, DeviceType::CPU, float); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "DepthToSpace", + DepthToSpaceOp, DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "DepthToSpace", + DepthToSpaceOp, DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_DEPTH_TO_SPACE_H_ diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.cc similarity index 74% rename from mace/kernels/depthwise_conv2d.h rename to mace/kernels/depthwise_conv2d.cc index a7765b30..74def6cf 100644 --- a/mace/kernels/depthwise_conv2d.h +++ b/mace/kernels/depthwise_conv2d.cc @@ -12,14 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_KERNELS_DEPTHWISE_CONV2D_H_ -#define MACE_KERNELS_DEPTHWISE_CONV2D_H_ - #if defined(MACE_ENABLE_NEON) && defined(__aarch64__) #include #endif #include #include +#include #include // We reuse TensorFlow Lite's optimized depthwiseconv_uint8 and parallelized it @@ -27,120 +25,51 @@ #include "tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h" #include "mace/core/future.h" -#include "mace/kernels/conv_pool_2d_util.h" +#include "mace/core/operator.h" #include "mace/kernels/activation.h" #include "mace/kernels/arm/depthwise_conv2d_neon.h" -#include "mace/kernels/quantize.h" +#include "mace/kernels/conv_pool_2d_base.h" #include "mace/public/mace.h" - +#include "mace/utils/quantize.h" #ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" +#include "mace/kernels/opencl/image/depthwise_conv2d.h" +#include "mace/kernels/opencl/buffer/depthwise_conv2d.h" #endif // MACE_ENABLE_OPENCL namespace mace { namespace kernels { -struct DepthwiseConv2dFunctorBase : OpKernel { - DepthwiseConv2dFunctorBase(OpKernelContext *context, - const int *strides, - const Padding padding_type, - const std::vector &paddings, - const int *dilations, - const ActivationType activation, - const float relux_max_limit) - : OpKernel(context), - strides_(strides), - padding_type_(padding_type), - paddings_(paddings), - dilations_(dilations), - activation_(activation), - relux_max_limit_(relux_max_limit) {} - - const int *strides_; // [stride_h, stride_w] - const Padding padding_type_; - std::vector paddings_; - const int *dilations_; // [dilation_h, dilation_w] +class DepthwiseConv2dOpBase : public ConvPool2dOpBase { + public: + explicit DepthwiseConv2dOpBase(OpConstructContext *context) + : ConvPool2dOpBase(context), + activation_(kernels::StringToActivationType( + Operation::GetOptionalArg("activation", + "NOOP"))), + relux_max_limit_(Operation::GetOptionalArg("max_limit", 0.0f)) {} + protected: const ActivationType activation_; const float 
relux_max_limit_; }; -template -struct DepthwiseConv2dFunctor; - -template<> -struct DepthwiseConv2dFunctor - : public DepthwiseConv2dFunctorBase { - DepthwiseConv2dFunctor(OpKernelContext *context, - const int *strides, - const Padding padding_type, - const std::vector &paddings, - const int *dilations, - const ActivationType activation, - const float relux_max_limit) - : DepthwiseConv2dFunctorBase(context, - strides, - padding_type, - paddings, - dilations, - activation, - relux_max_limit) {} - - void DepthwiseConv2dGeneral(const float *input, - const float *filter, - const index_t *in_shape, - const index_t *out_shape, - const index_t *filter_shape, - const int *stride_hw, - const int *dilation_hw, - const int *pad_hw, - float *output) { - const index_t multiplier = filter_shape[0] / filter_shape[1]; -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < in_shape[0]; ++b) { - for (index_t m = 0; m < filter_shape[0]; ++m) { - for (index_t h = 0; h < out_shape[2]; ++h) { - for (index_t w = 0; w < out_shape[3]; ++w) { - const index_t out_channels = filter_shape[0]; - const index_t in_channels = filter_shape[1]; - const index_t filter_height = filter_shape[2]; - const index_t filter_width = filter_shape[3]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - index_t out_offset = - ((b * out_channels + m) * out_height + h) * out_width + w; - index_t c = m / multiplier; - index_t o = m % multiplier; - float sum = 0; - for (index_t kh = 0; kh < filter_height; ++kh) { - for (index_t kw = 0; kw < filter_width; ++kw) { - index_t ih = h * stride_hw[0] + kh * dilation_hw[0] - pad_hw[0]; - index_t iw = w * stride_hw[1] + kw * dilation_hw[1] - pad_hw[1]; - if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) { - index_t in_offset = - ((b * in_channels + c) * in_height + ih) * in_width + iw; - index_t filter_offset = - (((o * in_channels) + c) * 
filter_height + kh) - * filter_width + kw; +template +class DepthwiseConv2dOp; - sum += input[in_offset] * filter[filter_offset]; - } - } - } - output[out_offset] = sum; - } - } - } +template <> +class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { + public: + explicit DepthwiseConv2dOp(OpConstructContext *context) + : DepthwiseConv2dOpBase(context) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(INPUT); + const Tensor *filter = this->Input(FILTER); + const Tensor *bias = nullptr; + if (this->InputSize() >= 3) { + bias = this->Input(BIAS); } - } - - MaceStatus operator()(const Tensor *input, // NCHW - const Tensor *filter, // OIHW - const Tensor *bias, - Tensor *output, // NCHW - StatsFuture *future) { - MACE_UNUSED(future); + Tensor *output = this->Output(OUTPUT); MACE_CHECK_NOTNULL(input); MACE_CHECK_NOTNULL(filter); MACE_CHECK_NOTNULL(output); @@ -148,14 +77,14 @@ struct DepthwiseConv2dFunctor std::vector output_shape(4); std::vector paddings(2); std::vector filter_shape - {filter->dim(0) * filter->dim(1), filter->dim(1), filter->dim(2), - filter->dim(3)}; + {filter->dim(0) * filter->dim(1), filter->dim(1), filter->dim(2), + filter->dim(3)}; if (paddings_.empty()) { CalcNCHWPaddingAndOutputSize(input->shape().data(), filter_shape.data(), - dilations_, - strides_, + dilations_.data(), + strides_.data(), padding_type_, output_shape.data(), paddings.data()); @@ -164,8 +93,8 @@ struct DepthwiseConv2dFunctor CalcNCHWOutputSize(input->shape().data(), filter_shape.data(), paddings_.data(), - dilations_, - strides_, + dilations_.data(), + strides_.data(), RoundType::FLOOR, output_shape.data()); } @@ -230,7 +159,7 @@ struct DepthwiseConv2dFunctor MACE_UNUSED(input_shape); if (filter_h == 3 && filter_w == 3 && stride_h == 1 && stride_w == 1 - && dilation_h == 1 && dilation_w == 1) { + && dilation_h == 1 && dilation_w == 1) { conv_func = [=](const float *input, float *output) { 
DepthwiseConv2dNeonK3x3S1(input, filter_data, @@ -244,7 +173,7 @@ struct DepthwiseConv2dFunctor output); }; } else if (filter_h == 3 && filter_w == 3 && stride_h == 2 && stride_w == 2 - && dilation_h == 1 && dilation_w == 1) { + && dilation_h == 1 && dilation_w == 1) { conv_func = [=](const float *input, float *output) { DepthwiseConv2dNeonK3x3S2(input, filter_data, @@ -264,8 +193,8 @@ struct DepthwiseConv2dFunctor input_shape, output_shape.data(), filter_shape.data(), - strides_, - dilations_, + strides_.data(), + dilations_.data(), pad_hw, output); }; @@ -279,7 +208,7 @@ struct DepthwiseConv2dFunctor for (index_t c = 0; c < channels; ++c) { for (index_t i = 0; i < height * width; ++i) { output_data[(b * channels + c) * height * width + i] += - bias_data[c]; + bias_data[c]; } } } @@ -288,115 +217,81 @@ struct DepthwiseConv2dFunctor DoActivation(output_data, output_data, output->size(), activation_, relux_max_limit_); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } -}; - -template<> -struct DepthwiseConv2dFunctor - : public DepthwiseConv2dFunctorBase { - DepthwiseConv2dFunctor(OpKernelContext *context, - const int *strides, - const Padding padding_type, - const std::vector &paddings, - const int *dilations, - const ActivationType activation, - const float relux_max_limit) - : DepthwiseConv2dFunctorBase(context, - strides, - padding_type, - paddings, - dilations, - activation, - relux_max_limit) {} - void DepthwiseConv2dGeneral(const uint8_t *input, - const uint8_t *filter, - const int32_t *bias, + private: + void DepthwiseConv2dGeneral(const float *input, + const float *filter, const index_t *in_shape, const index_t *out_shape, const index_t *filter_shape, - const int32_t input_zero, - const int32_t filter_zero, - const int32_t output_zero, - const float output_multiplier, const int *stride_hw, const int *dilation_hw, const int *pad_hw, - uint8_t *output) { + float *output) { + const index_t multiplier = filter_shape[0] / filter_shape[1]; #pragma omp 
parallel for collapse(2) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t h = 0; h < out_shape[1]; ++h) { - for (index_t w = 0; w < out_shape[2]; ++w) { - for (index_t m = 0; m < out_shape[3]; ++m) { - const index_t filter_height = filter_shape[0]; - const index_t filter_width = filter_shape[1]; - const index_t in_channels = filter_shape[2]; - const index_t depth_multiplier = filter_shape[3]; - const index_t in_height = in_shape[1]; - const index_t in_width = in_shape[2]; - const index_t out_height = out_shape[1]; - const index_t out_width = out_shape[2]; - const index_t out_channels = out_shape[3]; + for (index_t b = 0; b < in_shape[0]; ++b) { + for (index_t m = 0; m < filter_shape[0]; ++m) { + for (index_t h = 0; h < out_shape[2]; ++h) { + for (index_t w = 0; w < out_shape[3]; ++w) { + const index_t out_channels = filter_shape[0]; + const index_t in_channels = filter_shape[1]; + const index_t filter_height = filter_shape[2]; + const index_t filter_width = filter_shape[3]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; index_t out_offset = - ((b * out_height + h) * out_width + w) * out_channels + m; - index_t c = m / depth_multiplier; - index_t o = m % depth_multiplier; - index_t ih_base = h * stride_hw[0] - pad_hw[0]; - index_t iw_base = w * stride_hw[1] - pad_hw[1]; - int32_t sum = 0; + ((b * out_channels + m) * out_height + h) * out_width + w; + index_t c = m / multiplier; + index_t o = m % multiplier; + float sum = 0; for (index_t kh = 0; kh < filter_height; ++kh) { - const index_t ih = ih_base + kh * dilation_hw[0]; for (index_t kw = 0; kw < filter_width; ++kw) { - const index_t iw = iw_base + kw * dilation_hw[1]; + index_t ih = h * stride_hw[0] + kh * dilation_hw[0] - pad_hw[0]; + index_t iw = w * stride_hw[1] + kw * dilation_hw[1] - pad_hw[1]; if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) { index_t in_offset = - 
((b * in_height + ih) * in_width + iw) * in_channels + c; + ((b * in_channels + c) * in_height + ih) * in_width + iw; index_t filter_offset = - ((kh * filter_width + kw) * in_channels + c) - * depth_multiplier + o; + (((o * in_channels) + c) * filter_height + kh) + * filter_width + kw; - sum += (input[in_offset] - input_zero) * - (filter[filter_offset] - filter_zero); + sum += input[in_offset] * filter[filter_offset]; } } } - if (bias) { - sum += bias[m]; - } - sum = static_cast(std::round(sum * output_multiplier)); - sum += output_zero; - output[out_offset] = - static_cast(std::min(255, std::max(0, sum))); + output[out_offset] = sum; } } } } } - inline tflite::Dims<4> ShapeToTfliteDims(const std::vector &shape) { - tflite::Dims<4> d; - for (int i = 0; i < 4; ++i) { - int src = static_cast(shape.size() - i - 1); - if (src >= 0) { - d.sizes[i] = shape[src]; - } else { - d.sizes[i] = 1; - } - } - d.strides[0] = 1; - for (int i = 1; i < 4; i++) { - d.strides[i] = d.strides[i - 1] * d.sizes[i - 1]; - } - return d; - } + protected: + MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS); + MACE_OP_OUTPUT_TAGS(OUTPUT); +}; - MaceStatus operator()(const Tensor *input, // NHWC - const Tensor *filter, // HWIM - const Tensor *bias, - Tensor *output, // NHWC - StatsFuture *future) { - MACE_UNUSED(future); +template <> +class DepthwiseConv2dOp + : public DepthwiseConv2dOpBase { + public: + explicit DepthwiseConv2dOp(OpConstructContext *context) + : DepthwiseConv2dOpBase(context) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(INPUT); + const Tensor *filter = this->Input(FILTER); + const Tensor *bias = nullptr; + if (this->InputSize() >= 3) { + bias = this->Input(BIAS); + } + Tensor *output = this->Output(OUTPUT); MACE_CHECK_NOTNULL(input); MACE_CHECK_NOTNULL(filter); MACE_CHECK_NOTNULL(output); @@ -412,8 +307,8 @@ struct DepthwiseConv2dFunctor NHWC, ohwi_shape.data(), OHWI, - dilations_, - strides_, + dilations_.data(), + 
strides_.data(), padding_type_, output_shape.data(), paddings.data()); @@ -424,8 +319,8 @@ struct DepthwiseConv2dFunctor ohwi_shape.data(), OHWI, paddings_.data(), - dilations_, - strides_, + dilations_.data(), + strides_.data(), RoundType::FLOOR, output_shape.data()); } @@ -493,54 +388,149 @@ struct DepthwiseConv2dFunctor input_data, filter_data, bias_data, input->shape().data(), output_shape.data(), filter->shape().data(), input->zero_point(), filter->zero_point(), output->zero_point(), output_multiplier, - strides_, dilations_, pad_hw, output_data); + strides_.data(), dilations_.data(), pad_hw, output_data); } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } + private: + void DepthwiseConv2dGeneral(const uint8_t *input, + const uint8_t *filter, + const int32_t *bias, + const index_t *in_shape, + const index_t *out_shape, + const index_t *filter_shape, + const int32_t input_zero, + const int32_t filter_zero, + const int32_t output_zero, + const float output_multiplier, + const int *stride_hw, + const int *dilation_hw, + const int *pad_hw, + uint8_t *output) { +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t h = 0; h < out_shape[1]; ++h) { + for (index_t w = 0; w < out_shape[2]; ++w) { + for (index_t m = 0; m < out_shape[3]; ++m) { + const index_t filter_height = filter_shape[0]; + const index_t filter_width = filter_shape[1]; + const index_t in_channels = filter_shape[2]; + const index_t depth_multiplier = filter_shape[3]; + const index_t in_height = in_shape[1]; + const index_t in_width = in_shape[2]; + const index_t out_height = out_shape[1]; + const index_t out_width = out_shape[2]; + const index_t out_channels = out_shape[3]; + index_t out_offset = + ((b * out_height + h) * out_width + w) * out_channels + m; + index_t c = m / depth_multiplier; + index_t o = m % depth_multiplier; + index_t ih_base = h * stride_hw[0] - pad_hw[0]; + index_t iw_base = w * stride_hw[1] - pad_hw[1]; + int32_t sum = 0; + 
for (index_t kh = 0; kh < filter_height; ++kh) { + const index_t ih = ih_base + kh * dilation_hw[0]; + for (index_t kw = 0; kw < filter_width; ++kw) { + const index_t iw = iw_base + kw * dilation_hw[1]; + if (ih >= 0 && ih < in_height && iw >= 0 && iw < in_width) { + index_t in_offset = + ((b * in_height + ih) * in_width + iw) * in_channels + c; + index_t filter_offset = + ((kh * filter_width + kw) * in_channels + c) + * depth_multiplier + o; + + sum += (input[in_offset] - input_zero) * + (filter[filter_offset] - filter_zero); + } + } + } + if (bias) { + sum += bias[m]; + } + sum = static_cast(std::round(sum * output_multiplier)); + sum += output_zero; + output[out_offset] = + static_cast(std::min(255, std::max(0, sum))); + } + } + } + } + } + + inline tflite::Dims<4> ShapeToTfliteDims(const std::vector &shape) { + tflite::Dims<4> d; + for (int i = 0; i < 4; ++i) { + int src = static_cast(shape.size() - i - 1); + if (src >= 0) { + d.sizes[i] = shape[src]; + } else { + d.sizes[i] = 1; + } + } + d.strides[0] = 1; + for (int i = 1; i < 4; i++) { + d.strides[i] = d.strides[i - 1] * d.sizes[i - 1]; + } + return d; + } + + protected: + MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; #ifdef MACE_ENABLE_OPENCL -class OpenCLDepthwiseConv2dKernel { +template +class DepthwiseConv2dOp : public DepthwiseConv2dOpBase { public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const int *strides, - const Padding &padding_type, - const std::vector &padding_data, - const int *dilations, - const ActivationType activation, - const float relux_max_limit, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLDepthwiseConv2dKernel); -}; - -template -struct DepthwiseConv2dFunctor - : DepthwiseConv2dFunctorBase { - DepthwiseConv2dFunctor(OpKernelContext *context, - const int *strides, - const Padding padding_type, - const std::vector &paddings, - const 
int *dilations, - const ActivationType activation, - const float relux_max_limit); - - MaceStatus operator()(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - Tensor *output, - StatsFuture *future); + explicit DepthwiseConv2dOp(OpConstructContext *context) + : DepthwiseConv2dOpBase(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::DepthwiseConv2dKernel); + } else { + kernel_.reset(new opencl::buffer::DepthwiseConv2dKernel); + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(INPUT); + const Tensor *filter = this->Input(FILTER); + const Tensor *bias = nullptr; + if (this->InputSize() >= 3) { + bias = this->Input(BIAS); + } + Tensor *output = this->Output(OUTPUT); + return kernel_->Compute(context, input, filter, bias, + strides_.data(), padding_type_, paddings_, + dilations_.data(), activation_, relux_max_limit_, + output); + } + private: std::unique_ptr kernel_; + + protected: + MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; #endif // MACE_ENABLE_OPENCL + +void RegisterDepthwiseConv2d(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "DepthwiseConv2d", + DepthwiseConv2dOp, DeviceType::CPU, float); + + MACE_REGISTER_OP(op_registry, "DepthwiseConv2d", + DepthwiseConv2dOp, DeviceType::CPU, uint8_t); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "DepthwiseConv2d", + DepthwiseConv2dOp, DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "DepthwiseConv2d", + DepthwiseConv2dOp, DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_DEPTHWISE_CONV2D_H_ diff --git a/mace/kernels/eltwise.cc b/mace/kernels/eltwise.cc new file mode 100644 index 00000000..e33006ea --- /dev/null +++ b/mace/kernels/eltwise.cc @@ -0,0 +1,1125 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/kernels/eltwise.h" + +#include +#include +#include +#include +#include +#include + +#include "mace/core/future.h" +#include "mace/core/operator.h" +#include "mace/core/tensor.h" +#include "mace/utils/quantize.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/eltwise.h" +#endif // MACE_ENABLE_OPENCL + +namespace mace { +namespace kernels { + + +inline index_t GetIndex(const std::vector &shape, + const std::vector &index) { + index_t idx = 0; + for (size_t i = 0; i < shape.size(); ++i) { + if (shape[i] > 1) { + idx = idx * shape[i] + index[i]; + } + } + return idx; +} + +inline void IncreaseIndex(const std::vector &shape, + std::vector *index) { + for (index_t i = static_cast(shape.size()) - 1; i >= 0; --i) { + ++(*index)[i]; + if ((*index)[i] >= shape[i]) { + (*index)[i] -= shape[i]; + } else { + break; + } + } +} + +template +inline void TensorGeneralBroadcastEltwise( + const EltwiseType type, + const T *input0, + const T *input1, + const std::vector &coeff, + const bool swapped, + const std::vector &input0_shape, + const std::vector &input1_shape, + const std::vector &output_shape, + DstType *output) { + const index_t output_size = std::accumulate( + output_shape.begin(), output_shape.end(), 1, std::multiplies()); + std::vector out_index(output_shape.size(), 0); + switch (type) { + case SUM: + if (coeff.empty()) { + for (index_t i = 0; i < output_size; ++i) { + const index_t 
idx0 = GetIndex(input0_shape, out_index); + const index_t idx1 = GetIndex(input1_shape, out_index); + output[i] = input0[idx0] + input1[idx1]; + IncreaseIndex(output_shape, &out_index); + } + } else { + std::vector coeff_copy = coeff; + if (swapped) { + std::swap(coeff_copy[0], coeff_copy[1]); + } + for (index_t i = 0; i < output_size; ++i) { + const index_t idx0 = GetIndex(input0_shape, out_index); + const index_t idx1 = GetIndex(input1_shape, out_index); + output[i] = + input0[idx0] * coeff_copy[0] + input1[idx1] * coeff_copy[1]; + IncreaseIndex(output_shape, &out_index); + } + } + break; + case SUB: + if (!swapped) { + for (index_t i = 0; i < output_size; ++i) { + const index_t idx0 = GetIndex(input0_shape, out_index); + const index_t idx1 = GetIndex(input1_shape, out_index); + output[i] = input0[idx0] - input1[idx1]; + IncreaseIndex(output_shape, &out_index); + } + } else { + for (index_t i = 0; i < output_size; ++i) { + const index_t idx0 = GetIndex(input0_shape, out_index); + const index_t idx1 = GetIndex(input1_shape, out_index); + output[i] = input1[idx1] - input0[idx0]; + IncreaseIndex(output_shape, &out_index); + } + } + break; + case PROD: + for (index_t i = 0; i < output_size; ++i) { + const index_t idx0 = GetIndex(input0_shape, out_index); + const index_t idx1 = GetIndex(input1_shape, out_index); + output[i] = input0[idx0] * input1[idx1]; + IncreaseIndex(output_shape, &out_index); + } + break; + case DIV: + if (!swapped) { + for (index_t i = 0; i < output_size; ++i) { + const index_t idx0 = GetIndex(input0_shape, out_index); + const index_t idx1 = GetIndex(input1_shape, out_index); + output[i] = input0[idx0] / input1[idx1]; + IncreaseIndex(output_shape, &out_index); + } + } else { + for (index_t i = 0; i < output_size; ++i) { + const index_t idx0 = GetIndex(input0_shape, out_index); + const index_t idx1 = GetIndex(input1_shape, out_index); + output[i] = input1[idx1] / input0[idx0]; + IncreaseIndex(output_shape, &out_index); + } + } + break; + case MIN: 
+ for (index_t i = 0; i < output_size; ++i) { + const index_t idx0 = GetIndex(input0_shape, out_index); + const index_t idx1 = GetIndex(input1_shape, out_index); + output[i] = std::min(input1[idx1], input0[idx0]); + IncreaseIndex(output_shape, &out_index); + } + break; + case MAX: + for (index_t i = 0; i < output_size; ++i) { + const index_t idx0 = GetIndex(input0_shape, out_index); + const index_t idx1 = GetIndex(input1_shape, out_index); + output[i] = std::max(input1[idx1], input0[idx0]); + IncreaseIndex(output_shape, &out_index); + } + break; + case SQR_DIFF: + for (index_t i = 0; i < output_size; ++i) { + const index_t idx0 = GetIndex(input0_shape, out_index); + const index_t idx1 = GetIndex(input1_shape, out_index); + output[i] = std::pow(input1[idx1] - input0[idx0], 2.f); + IncreaseIndex(output_shape, &out_index); + } + break; + case POW: + if (!swapped) { + for (index_t i = 0; i < output_size; ++i) { + const index_t idx0 = GetIndex(input0_shape, out_index); + const index_t idx1 = GetIndex(input1_shape, out_index); + output[i] = std::pow(input0[idx0], input1[idx1]); + IncreaseIndex(output_shape, &out_index); + } + } else { + for (index_t i = 0; i < output_size; ++i) { + const index_t idx0 = GetIndex(input0_shape, out_index); + const index_t idx1 = GetIndex(input1_shape, out_index); + output[i] = std::pow(input1[idx1], input0[idx0]); + IncreaseIndex(output_shape, &out_index); + } + } + break; + case EQUAL: + for (index_t i = 0; i < output_size; ++i) { + const index_t idx0 = GetIndex(input0_shape, out_index); + const index_t idx1 = GetIndex(input1_shape, out_index); + output[i] = input1[idx1] == input0[idx0]; + IncreaseIndex(output_shape, &out_index); + } + break; + default: + LOG(FATAL) << "Eltwise op not support type " << type; + } +} + +template +inline void TensorBroadcastEltwise(const EltwiseType type, + const T *input0, + const T *input1, + const std::vector &coeff, + const index_t diff_size, + const index_t common_size, + const bool swapped, + DstType 
*output) { + switch (type) { + case SUM: + if (coeff.empty()) { +#pragma omp parallel for collapse(2) + for (index_t d = 0; d < diff_size; ++d) { + for (index_t i = 0; i < common_size; ++i) { + output[i + d * common_size] = + input0[i + d * common_size] + input1[i]; + } + } + } else { + std::vector coeff_copy = coeff; + if (swapped) { + std::swap(coeff_copy[0], coeff_copy[1]); + } +#pragma omp parallel for collapse(2) + for (index_t d = 0; d < diff_size; ++d) { + for (index_t i = 0; i < common_size; ++i) { + output[i + d * common_size] = + input0[i + d * common_size] * coeff_copy[0] + + input1[i] * coeff_copy[1]; + } + } + } + break; + case SUB: + if (!swapped) { +#pragma omp parallel for collapse(2) + for (index_t d = 0; d < diff_size; ++d) { + for (index_t i = 0; i < common_size; ++i) { + output[i + d * common_size] = + input0[i + d * common_size] - input1[i]; + } + } + } else { +#pragma omp parallel for collapse(2) + for (index_t d = 0; d < diff_size; ++d) { + for (index_t i = 0; i < common_size; ++i) { + output[i + d * common_size] = + input1[i] - input0[i + d * common_size]; + } + } + } + break; + case PROD: +#pragma omp parallel for collapse(2) + for (index_t d = 0; d < diff_size; ++d) { + for (index_t i = 0; i < common_size; ++i) { + output[i + d * common_size] = input0[i + d * common_size] * input1[i]; + } + } + break; + case DIV: + if (!swapped) { +#pragma omp parallel for collapse(2) + for (index_t d = 0; d < diff_size; ++d) { + for (index_t i = 0; i < common_size; ++i) { + output[i + d * common_size] = + input0[i + d * common_size] / input1[i]; + } + } + } else { +#pragma omp parallel for collapse(2) + for (index_t d = 0; d < diff_size; ++d) { + for (index_t i = 0; i < common_size; ++i) { + output[i + d * common_size] = + input1[i] / input0[i + d * common_size]; + } + } + } + break; + case MIN: +#pragma omp parallel for collapse(2) + for (index_t d = 0; d < diff_size; ++d) { + for (index_t i = 0; i < common_size; ++i) { + output[i + d * common_size] = + 
std::min(input0[i + d * common_size], input1[i]); + } + } + break; + case MAX: +#pragma omp parallel for collapse(2) + for (index_t d = 0; d < diff_size; ++d) { + for (index_t i = 0; i < common_size; ++i) { + output[i + d * common_size] = + std::max(input0[i + d * common_size], input1[i]); + } + } + break; + case SQR_DIFF: +#pragma omp parallel for collapse(2) + for (index_t d = 0; d < diff_size; ++d) { + for (index_t i = 0; i < common_size; ++i) { + output[i + d * common_size] = + std::pow(input0[i + d * common_size] - input1[i], 2.f); + } + } + break; + case POW: + if (!swapped) { +#pragma omp parallel for collapse(2) + for (index_t d = 0; d < diff_size; ++d) { + for (index_t i = 0; i < common_size; ++i) { + output[i + d * common_size] = + std::pow(input0[i + d * common_size], input1[i]); + } + } + } else { +#pragma omp parallel for collapse(2) + for (index_t d = 0; d < diff_size; ++d) { + for (index_t i = 0; i < common_size; ++i) { + output[i + d * common_size] = + std::pow(input1[i], input0[i + d * common_size]); + } + } + } + break; + case NEG: +#pragma omp parallel for + for (index_t i = 0; i < diff_size * common_size; ++i) { + output[i] = -input0[i]; + } + break; + case ABS: +#pragma omp parallel for + for (index_t i = 0; i < diff_size * common_size; ++i) { + output[i] = std::fabs(input0[i]); + } + break; + case EQUAL: +#pragma omp parallel for collapse(2) + for (index_t d = 0; d < diff_size; ++d) { + for (index_t i = 0; i < common_size; ++i) { + output[i + d * common_size] = + input0[i + d * common_size] == input1[i]; + } + } + break; + default: + LOG(FATAL) << "Eltwise op not support type " << type; + } +} + +// Multiplication is costly, so we specialize the following case. 
+template +inline void TensorEltwise(const EltwiseType type, + const T *input0, + const T *input1, + const std::vector &coeff, + const index_t size, + const bool swapped, + DstType *output) { + switch (type) { + case SUM: + if (coeff.empty()) { +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = input0[i] + input1[i]; + } + + } else { + std::vector coeff_copy = coeff; + if (swapped) { + std::swap(coeff_copy[0], coeff_copy[1]); + } +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = input0[i] * coeff_copy[0] + input1[i] * coeff_copy[1]; + } + } + break; + case SUB: + if (!swapped) { +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = input0[i] - input1[i]; + } + + } else { +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = input1[i] - input0[i]; + } + } + break; + case PROD: +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = input0[i] * input1[i]; + } + + break; + case DIV: + if (!swapped) { +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = input0[i] / input1[i]; + } + + } else { +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = input1[i] / input0[i]; + } + } + break; + case MIN: +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = std::min(input0[i], input1[i]); + } + + break; + case MAX: +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = std::max(input0[i], input1[i]); + } + + break; + case SQR_DIFF: +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = std::pow(input0[i] - input1[i], 2.f); + } + + break; + case POW: + if (!swapped) { +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = std::pow(input0[i], input1[i]); + } + } else { + for (index_t i = 0; i < size; ++i) { + output[i] = std::pow(input1[i], input0[i]); + } + } + break; + case NEG: +#pragma omp parallel for 
+ for (index_t i = 0; i < size; ++i) { + output[i] = -input0[i]; + } + break; + case ABS: +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = std::fabs(input0[i]); + } + break; + case EQUAL: +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = input0[i] == input1[i]; + } + break; + default: + LOG(FATAL) << "Eltwise op not support type " << type; + } +} + +// Multiplication is costly, so we specialize the following case. +template +inline void TensorScalarEltwise(const EltwiseType type, + const T *input0, + const T input1, + const std::vector &coeff, + const index_t size, + const bool swapped, + DstType *output) { + switch (type) { + case SUM: + if (coeff.empty()) { +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = input0[i] + input1; + } + + } else { + std::vector coeff_copy = coeff; + if (swapped) { + std::swap(coeff_copy[0], coeff_copy[1]); + } +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = input0[i] * coeff_copy[0] + input1 * coeff_copy[1]; + } + } + break; + case SUB: + if (!swapped) { +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = input0[i] - input1; + } + + } else { +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = input1 - input0[i]; + } + } + break; + case PROD: +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = input0[i] * input1; + } + + break; + case DIV: + if (!swapped) { +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = input0[i] / input1; + } + + } else { +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = input1 / input0[i]; + } + } + break; + case MIN: +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = std::min(input0[i], input1); + } + + break; + case MAX: +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = std::max(input0[i], input1); + } 
+ + break; + case SQR_DIFF: +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = std::pow(input0[i] - input1, 2.f); + } + + break; + case POW: + if (!swapped) { +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = std::pow(input0[i], input1); + } + } else { + for (index_t i = 0; i < size; ++i) { + output[i] = std::pow(input1, input0[i]); + } + } + break; + case NEG: +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = -input0[i]; + } + break; + case ABS: +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = std::fabs(input0[i]); + } + break; + case EQUAL: +#pragma omp parallel for + for (index_t i = 0; i < size; ++i) { + output[i] = input0[i] == input1; + } + + break; + default: + LOG(FATAL) << "Eltwise op not support type " << type; + } +} + +template +inline void TensorEltwisePerChannel(const EltwiseType type, + const T *input0, + const T *input1, + const std::vector &coeff, + const index_t batch0, + const index_t batch1, + const index_t channel, + const index_t image_size, + const bool swapped, + DstType *output) { + switch (type) { + case SUM: + if (coeff.empty()) { +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch0; ++b) { + for (index_t c = 0; c < channel; ++c) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = in0_ptr[i] + in1_ptr[c]; + } + } + } + } else { + std::vector coeff_copy = coeff; + if (swapped) { + std::swap(coeff_copy[0], coeff_copy[1]); + } +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch0; ++b) { + for (index_t c = 0; c < channel; ++c) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? 
b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = + in0_ptr[i] * coeff_copy[0] + in1_ptr[c] * coeff_copy[1]; + } + } + } + } + break; + case SUB: + if (!swapped) { +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch0; ++b) { + for (index_t c = 0; c < channel; ++c) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = in0_ptr[i] - in1_ptr[c]; + } + } + } + } else { +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch0; ++b) { + for (index_t c = 0; c < channel; ++c) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = in1_ptr[c] - in0_ptr[i]; + } + } + } + } + break; + case PROD: +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch0; ++b) { + for (index_t c = 0; c < channel; ++c) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = in0_ptr[i] * in1_ptr[c]; + } + } + } + break; + case DIV: + if (!swapped) { +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch0; ++b) { + for (index_t c = 0; c < channel; ++c) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? 
b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = in0_ptr[i] / in1_ptr[c]; + } + } + } + } else { +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch0; ++b) { + for (index_t c = 0; c < channel; ++c) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = in1_ptr[c] / in0_ptr[i]; + } + } + } + } + break; + case MIN: +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch0; ++b) { + for (index_t c = 0; c < channel; ++c) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = std::min(in0_ptr[i], in1_ptr[c]); + } + } + } + break; + case MAX: +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch0; ++b) { + for (index_t c = 0; c < channel; ++c) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = std::max(in0_ptr[i], in1_ptr[c]); + } + } + } + break; + case SQR_DIFF: +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch0; ++b) { + for (index_t c = 0; c < channel; ++c) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? 
b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = std::pow(in0_ptr[i] - in1_ptr[c], 2.f); + } + } + } + break; + case POW: + if (!swapped) { +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch0; ++b) { + for (index_t c = 0; c < channel; ++c) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = std::pow(in0_ptr[i], in1_ptr[c]); + } + } + } + } else { +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch0; ++b) { + for (index_t c = 0; c < channel; ++c) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = std::pow(in1_ptr[c], in0_ptr[i]); + } + } + } + } + break; + case NEG: +#pragma omp parallel for + for (index_t i = 0; i < batch0 * channel * image_size; ++i) { + output[i] = -input0[i]; + } + break; + case ABS: +#pragma omp parallel for + for (index_t i = 0; i < batch0 * channel * image_size; ++i) { + output[i] = std::fabs(input0[i]); + } + break; + case EQUAL: +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch0; ++b) { + for (index_t c = 0; c < channel; ++c) { + const T *in0_ptr = input0 + ((b * channel) + c) * image_size; + const T *in1_ptr = input1 + (batch1 > 1 ? 
b * channel : 0); + DstType *out_ptr = output + ((b * channel) + c) * image_size; + for (index_t i = 0; i < image_size; ++i) { + out_ptr[i] = in0_ptr[i] == in1_ptr[c]; + } + } + } + break; + default: + LOG(FATAL) << "Eltwise op not support type " << type; + } +} + +template +class EltwiseOp : public Operation { + public: + explicit EltwiseOp(OpConstructContext *context) + : Operation(context), + type_(static_cast(Operation::GetOptionalArg( + "type", static_cast(kernels::EltwiseType::NONE)))), + coeff_(Operation::GetRepeatedArgs("coeff")), + scalar_input_(Operation::GetOptionalArg("scalar_input", 1.0)), + scalar_input_index_(Operation::GetOptionalArg( + "scalar_input_index", 1)), + data_format_(static_cast(Operation::GetOptionalArg( + "data_format", 0))) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input0 = this->Input(0); + const Tensor *input1 = this->InputSize() == 2 ? this->Input(1) : nullptr; + Tensor *output = this->Output(0); + if (input1 == nullptr) { + scalar_tensor_.Resize({}); + Tensor::MappingGuard guard(&scalar_tensor_); + auto scalar_data = scalar_tensor_.mutable_data(); + scalar_data[0] = static_cast(scalar_input_); + input1 = &scalar_tensor_; + } + + if (IsLogicalType(type_)) { + // as we do not have bool-type tensor, we use int type + return DoEltwise(input0, input1, output); + } else { + return DoEltwise(input0, input1, output); + } + } + + private: + template + MaceStatus DoEltwise(const Tensor *input0, + const Tensor *input1, + Tensor *output) { + bool swapped = false; + if (input0->size() < input1->size()) { + std::swap(input0, input1); + swapped = true; + } + if (scalar_input_index_ == 0) { + swapped = !swapped; + } + + // check if we can broadcast tensor + uint32_t rank_diff = + static_cast(input0->dim_size() - input1->dim_size()); + if (data_format_ == NCHW) { + MACE_CHECK( + (input0->dim_size() == 4) && + ((input1->dim_size() == 0) || + (input1->dim_size() == 4 && + input1->dim(1) == 
input0->dim(1) && + (input1->dim(0) == input0->dim(0) || + input1->dim(0) == 1)) || + (input1->dim_size() == 1 && + input1->dim(0) == input0->dim(1))), + "only support broadcast channel dimension"); + } else { + for (uint32_t i = 0; i < input1->dim_size(); ++i) { + MACE_CHECK(input0->dim(rank_diff + i) == 1 || input1->dim(i) == 1 || + input0->dim(rank_diff + i) == input1->dim(i), + "Element-Wise op only support tail dimensions broadcast"); + } + } + + Tensor::MappingGuard input0_guard(input0); + Tensor::MappingGuard input1_guard(input1); + + const T *input0_ptr = input0->data(); + const T *input1_ptr = input1->data(); + + if (data_format_ == NCHW && input1->dim_size() > 0 && + input1->size() < input0->size()) { + MACE_RETURN_IF_ERROR(output->ResizeLike(input0)); + Tensor::MappingGuard output_guard(output); + DstType *output_ptr = output->mutable_data(); + TensorEltwisePerChannel( + type_, input0_ptr, input1_ptr, coeff_, input0->dim(0), + input1->dim_size() == 1 ? 1 : input1->dim(0), input0->dim(1), + input0->dim(2) * input0->dim(3), swapped, output_ptr); + + } else { + const std::vector &input0_shape = input0->shape(); + std::vector input1_shape(rank_diff, 1); + input1_shape.insert(input1_shape.end(), input1->shape().begin(), + input1->shape().end()); + + std::vector output_shape(input0->dim_size(), 0); + for (unsigned int i = 0; i < input0_shape.size(); ++i) { + output_shape[i] = std::max(input0_shape[i], input1_shape[i]); + } + MACE_RETURN_IF_ERROR(output->Resize(output_shape)); + Tensor::MappingGuard output_guard(output); + DstType *output_ptr = output->mutable_data(); + + bool need_general_broadcast = false; + for (uint32_t i = 0; i < input1->dim_size(); ++i) { + if ((input0->dim(rank_diff + i) == 1 && input1->dim(i) > 1) || + (input0->dim(rank_diff + i) > 1 && input1->dim(i) == 1)) { + need_general_broadcast = true; + break; + } + } + + if (need_general_broadcast) { + TensorGeneralBroadcastEltwise(type_, input0_ptr, input1_ptr, coeff_, + swapped, input0_shape, 
input1_shape, + output_shape, output_ptr); + } else if (input1->size() == input0->size()) { + TensorEltwise(type_, input0_ptr, input1_ptr, coeff_, input0->size(), + swapped, output_ptr); + } else if (input1->size() < input0->size()) { + if (input1->size() > 1) { + index_t common_size = input1->size(); + index_t diff_size = input0->size() / common_size; + TensorBroadcastEltwise(type_, input0_ptr, input1_ptr, coeff_, + diff_size, common_size, swapped, output_ptr); + } else { + TensorScalarEltwise(type_, input0_ptr, input1_ptr[0], coeff_, + input0->size(), swapped, output_ptr); + } + } + } + + return MaceStatus::MACE_SUCCESS; + } + + private: + EltwiseType type_; + std::vector coeff_; + float scalar_input_; + int32_t scalar_input_index_; + DataFormat data_format_; + Tensor scalar_tensor_; +}; + +template <> +class EltwiseOp : public Operation { + public: + explicit EltwiseOp(OpConstructContext *context) + : Operation(context), + type_(static_cast(Operation::GetOptionalArg( + "type", static_cast(kernels::EltwiseType::NONE)))), + coeff_(Operation::GetRepeatedArgs("coeff")), + scalar_input_(Operation::GetOptionalArg("scalar_input", 1.0)), + scalar_input_index_(Operation::GetOptionalArg( + "scalar_input_index", 1)), + data_format_(static_cast(Operation::GetOptionalArg( + "data_format", 0))) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input0 = this->Input(0); + const Tensor *input1 = this->InputSize() == 2 ? this->Input(1) : nullptr; + Tensor *output = this->Output(0); + MACE_CHECK(type_ == SUM, "Only support Elementwise SUM now. 
"); + MACE_CHECK(input0->size() == input1->size(), + "input0 and input1 must have the same shape."); + MACE_CHECK(output->scale() != 0); + MACE_RETURN_IF_ERROR(output->Resize(input0->shape())); + + constexpr int left_shift = 20; + const double doubled_scale = 2 * std::max(input0->scale(), input1->scale()); + const double adjusted_input0_scale = input0->scale() / doubled_scale; + const double adjusted_input1_scale = input1->scale() / doubled_scale; + const double adjusted_output_scale = + doubled_scale / ((1 << left_shift) * output->scale()); + + int32_t input0_multiplier; + int32_t input1_multiplier; + int32_t output_multiplier; + int32_t input0_shift; + int32_t input1_shift; + int32_t output_shift; + QuantizeMultiplier(adjusted_input0_scale, + &input0_multiplier, + &input0_shift); + QuantizeMultiplier(adjusted_input1_scale, + &input1_multiplier, + &input1_shift); + QuantizeMultiplier(adjusted_output_scale, + &output_multiplier, + &output_shift); + + Tensor::MappingGuard input0_guard(input0); + Tensor::MappingGuard input1_guard(input1); + Tensor::MappingGuard output_guard(output); + + auto input0_ptr = input0->data(); + auto input1_ptr = input1->data(); + auto output_ptr = output->mutable_data(); + + index_t handled_output_size = 0; +#ifdef MACE_ENABLE_NEON + #pragma omp parallel for + for (index_t i = handled_output_size; i <= output->size() - 8; i += 8) { + const auto input0_val = vld1_u8(input0_ptr + i); + const auto input1_val = vld1_u8(input1_ptr + i); + const auto input0_val_s16 = + vreinterpretq_s16_u16(vmovl_u8(input0_val)); + const auto input1_val_s16 = + vreinterpretq_s16_u16(vmovl_u8(input1_val)); + const auto offset_input0 = + vaddq_s16(input0_val_s16, vdupq_n_s16(-input0->zero_point())); + const auto offset_input1 = + vaddq_s16(input1_val_s16, vdupq_n_s16(-input1->zero_point())); + auto input0_low_s32 = vmovl_s16(vget_low_s16(offset_input0)); + auto input0_high_s32 = vmovl_s16(vget_high_s16(offset_input0)); + auto input1_low_s32 = 
vmovl_s16(vget_low_s16(offset_input1)); + auto input1_high_s32 = vmovl_s16(vget_high_s16(offset_input1)); + const auto left_shift_dup = vdupq_n_s32(left_shift); + input0_low_s32 = vshlq_s32(input0_low_s32, left_shift_dup); + input0_high_s32 = vshlq_s32(input0_high_s32, left_shift_dup); + input1_low_s32 = vshlq_s32(input1_low_s32, left_shift_dup); + input1_high_s32 = vshlq_s32(input1_high_s32, left_shift_dup); + input0_low_s32 = vqrdmulhq_n_s32(input0_low_s32, input0_multiplier); + input0_high_s32 = vqrdmulhq_n_s32(input0_high_s32, input0_multiplier); + input1_low_s32 = vqrdmulhq_n_s32(input1_low_s32, input1_multiplier); + input1_high_s32 = vqrdmulhq_n_s32(input1_high_s32, input1_multiplier); + const auto input0_shift_dup = vdupq_n_s32(input0_shift); + const auto input1_shift_dup = vdupq_n_s32(input1_shift); + input0_low_s32 = vshlq_s32(input0_low_s32, input0_shift_dup); + input0_high_s32 = vshlq_s32(input0_high_s32, input0_shift_dup); + input1_low_s32 = vshlq_s32(input1_low_s32, input1_shift_dup); + input1_high_s32 = vshlq_s32(input1_high_s32, input1_shift_dup); + auto sum_low = vaddq_s32(input0_low_s32, input1_low_s32); + auto sum_high = vaddq_s32(input0_high_s32, input1_high_s32); + sum_low = vqrdmulhq_n_s32(sum_low, output_multiplier); + sum_high = vqrdmulhq_n_s32(sum_high, output_multiplier); + sum_low = gemmlowp::RoundingDivideByPOT(sum_low, -output_shift); + sum_high = gemmlowp::RoundingDivideByPOT(sum_high, -output_shift); + const auto sum_low_s16 = vmovn_s32(sum_low); + const auto sum_high_s16 = vmovn_s32(sum_high); + const auto output_val = vaddq_s16(vcombine_s16(sum_low_s16, + sum_high_s16), + vdupq_n_s16(output->zero_point())); + vst1_u8(output_ptr + i, vqmovun_s16(output_val)); + } + handled_output_size = output->size() - output->size() % 8; +#endif // NEON +#pragma omp parallel for + for (index_t i = handled_output_size; i < output->size(); ++i) { + const int32_t offset_input0 = input0_ptr[i] - input0->zero_point(); + const int32_t offset_input1 = 
input1_ptr[i] - input1->zero_point(); + const int32_t shifted_input0 = offset_input0 * (1 << left_shift); + const int32_t shifted_input1 = offset_input1 * (1 << left_shift); + const int32_t multiplied_input0 = + gemmlowp::RoundingDivideByPOT( + gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input0, + input0_multiplier), + -input0_shift); + const int32_t multiplied_input1 = + gemmlowp::RoundingDivideByPOT( + gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input1, + input1_multiplier), + -input1_shift); + const int32_t sum = multiplied_input0 + multiplied_input1; + const int32_t output_val = + gemmlowp::RoundingDivideByPOT( + gemmlowp::SaturatingRoundingDoublingHighMul(sum, + output_multiplier), + -output_shift) + output->zero_point(); + output_ptr[i] = Saturate(output_val); + } + + return MaceStatus::MACE_SUCCESS; + } + + private: + EltwiseType type_; + std::vector coeff_; + float scalar_input_; + int32_t scalar_input_index_; + DataFormat data_format_; + Tensor scalar_tensor_; +}; + +#ifdef MACE_ENABLE_OPENCL +template +class EltwiseOp : public Operation { + public: + explicit EltwiseOp(OpConstructContext *context) + : Operation(context) { + EltwiseType type = static_cast( + Operation::GetOptionalArg( + "type", static_cast(kernels::EltwiseType::NONE))); + std::vector coeff = Operation::GetRepeatedArgs("coeff"); + float scalar_input = Operation::GetOptionalArg("scalar_input", 1.0); + int32_t scalar_input_index = Operation::GetOptionalArg( + "scalar_input_index", 1); + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::EltwiseKernel( + type, coeff, scalar_input, scalar_input_index)); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *input0 = this->Input(0); + const Tensor *input1 = this->InputSize() == 2 ? 
this->Input(1) : nullptr; + Tensor *output = this->Output(0); + return kernel_->Compute(context, input0, input1, output); + } + + private: + std::unique_ptr kernel_; +}; +#endif // MACE_ENABLE_OPENCL + + +void RegisterEltwise(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp, + DeviceType::CPU, float); + + MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp, + DeviceType::CPU, int32_t); + + MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp, + DeviceType::CPU, uint8_t); +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp, + DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "Eltwise", EltwiseOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/eltwise.h b/mace/kernels/eltwise.h index d507011a..b71f4e42 100644 --- a/mace/kernels/eltwise.h +++ b/mace/kernels/eltwise.h @@ -15,18 +15,6 @@ #ifndef MACE_KERNELS_ELTWISE_H_ #define MACE_KERNELS_ELTWISE_H_ -#include -#include -#include -#include -#include -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" -#include "mace/utils/quantize.h" - namespace mace { namespace kernels { @@ -45,1071 +33,7 @@ enum EltwiseType { NONE = 11, }; -static bool IsLogicalType(EltwiseType type) { return type == EQUAL; } - -inline index_t GetIndex(const std::vector &shape, - const std::vector &index) { - index_t idx = 0; - for (size_t i = 0; i < shape.size(); ++i) { - if (shape[i] > 1) { - idx = idx * shape[i] + index[i]; - } - } - return idx; -} - -inline void IncreaseIndex(const std::vector &shape, - std::vector *index) { - for (index_t i = static_cast(shape.size()) - 1; i >= 0; --i) { - ++(*index)[i]; - if ((*index)[i] >= shape[i]) { - (*index)[i] -= shape[i]; - } else { - break; - } - } -} - -template -inline void TensorGeneralBroadcastEltwise( - const EltwiseType type, - const T *input0, - const T *input1, - const std::vector &coeff, - 
const bool swapped, - const std::vector &input0_shape, - const std::vector &input1_shape, - const std::vector &output_shape, - DstType *output) { - const index_t output_size = std::accumulate( - output_shape.begin(), output_shape.end(), 1, std::multiplies()); - std::vector out_index(output_shape.size(), 0); - switch (type) { - case SUM: - if (coeff.empty()) { - for (index_t i = 0; i < output_size; ++i) { - const index_t idx0 = GetIndex(input0_shape, out_index); - const index_t idx1 = GetIndex(input1_shape, out_index); - output[i] = input0[idx0] + input1[idx1]; - IncreaseIndex(output_shape, &out_index); - } - } else { - std::vector coeff_copy = coeff; - if (swapped) { - std::swap(coeff_copy[0], coeff_copy[1]); - } - for (index_t i = 0; i < output_size; ++i) { - const index_t idx0 = GetIndex(input0_shape, out_index); - const index_t idx1 = GetIndex(input1_shape, out_index); - output[i] = - input0[idx0] * coeff_copy[0] + input1[idx1] * coeff_copy[1]; - IncreaseIndex(output_shape, &out_index); - } - } - break; - case SUB: - if (!swapped) { - for (index_t i = 0; i < output_size; ++i) { - const index_t idx0 = GetIndex(input0_shape, out_index); - const index_t idx1 = GetIndex(input1_shape, out_index); - output[i] = input0[idx0] - input1[idx1]; - IncreaseIndex(output_shape, &out_index); - } - } else { - for (index_t i = 0; i < output_size; ++i) { - const index_t idx0 = GetIndex(input0_shape, out_index); - const index_t idx1 = GetIndex(input1_shape, out_index); - output[i] = input1[idx1] - input0[idx0]; - IncreaseIndex(output_shape, &out_index); - } - } - break; - case PROD: - for (index_t i = 0; i < output_size; ++i) { - const index_t idx0 = GetIndex(input0_shape, out_index); - const index_t idx1 = GetIndex(input1_shape, out_index); - output[i] = input0[idx0] * input1[idx1]; - IncreaseIndex(output_shape, &out_index); - } - break; - case DIV: - if (!swapped) { - for (index_t i = 0; i < output_size; ++i) { - const index_t idx0 = GetIndex(input0_shape, out_index); - const 
index_t idx1 = GetIndex(input1_shape, out_index); - output[i] = input0[idx0] / input1[idx1]; - IncreaseIndex(output_shape, &out_index); - } - } else { - for (index_t i = 0; i < output_size; ++i) { - const index_t idx0 = GetIndex(input0_shape, out_index); - const index_t idx1 = GetIndex(input1_shape, out_index); - output[i] = input1[idx1] / input0[idx0]; - IncreaseIndex(output_shape, &out_index); - } - } - break; - case MIN: - for (index_t i = 0; i < output_size; ++i) { - const index_t idx0 = GetIndex(input0_shape, out_index); - const index_t idx1 = GetIndex(input1_shape, out_index); - output[i] = std::min(input1[idx1], input0[idx0]); - IncreaseIndex(output_shape, &out_index); - } - break; - case MAX: - for (index_t i = 0; i < output_size; ++i) { - const index_t idx0 = GetIndex(input0_shape, out_index); - const index_t idx1 = GetIndex(input1_shape, out_index); - output[i] = std::max(input1[idx1], input0[idx0]); - IncreaseIndex(output_shape, &out_index); - } - break; - case SQR_DIFF: - for (index_t i = 0; i < output_size; ++i) { - const index_t idx0 = GetIndex(input0_shape, out_index); - const index_t idx1 = GetIndex(input1_shape, out_index); - output[i] = std::pow(input1[idx1] - input0[idx0], 2.f); - IncreaseIndex(output_shape, &out_index); - } - break; - case POW: - if (!swapped) { - for (index_t i = 0; i < output_size; ++i) { - const index_t idx0 = GetIndex(input0_shape, out_index); - const index_t idx1 = GetIndex(input1_shape, out_index); - output[i] = std::pow(input0[idx0], input1[idx1]); - IncreaseIndex(output_shape, &out_index); - } - } else { - for (index_t i = 0; i < output_size; ++i) { - const index_t idx0 = GetIndex(input0_shape, out_index); - const index_t idx1 = GetIndex(input1_shape, out_index); - output[i] = std::pow(input1[idx1], input0[idx0]); - IncreaseIndex(output_shape, &out_index); - } - } - break; - case EQUAL: - for (index_t i = 0; i < output_size; ++i) { - const index_t idx0 = GetIndex(input0_shape, out_index); - const index_t idx1 = 
GetIndex(input1_shape, out_index); - output[i] = input1[idx1] == input0[idx0]; - IncreaseIndex(output_shape, &out_index); - } - break; - default: - LOG(FATAL) << "Eltwise op not support type " << type; - } -} - -template -inline void TensorBroadcastEltwise(const EltwiseType type, - const T *input0, - const T *input1, - const std::vector &coeff, - const index_t diff_size, - const index_t common_size, - const bool swapped, - DstType *output) { - switch (type) { - case SUM: - if (coeff.empty()) { -#pragma omp parallel for collapse(2) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - input0[i + d * common_size] + input1[i]; - } - } - } else { - std::vector coeff_copy = coeff; - if (swapped) { - std::swap(coeff_copy[0], coeff_copy[1]); - } -#pragma omp parallel for collapse(2) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - input0[i + d * common_size] * coeff_copy[0] + - input1[i] * coeff_copy[1]; - } - } - } - break; - case SUB: - if (!swapped) { -#pragma omp parallel for collapse(2) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - input0[i + d * common_size] - input1[i]; - } - } - } else { -#pragma omp parallel for collapse(2) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - input1[i] - input0[i + d * common_size]; - } - } - } - break; - case PROD: -#pragma omp parallel for collapse(2) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = input0[i + d * common_size] * input1[i]; - } - } - break; - case DIV: - if (!swapped) { -#pragma omp parallel for collapse(2) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - input0[i + d * 
common_size] / input1[i]; - } - } - } else { -#pragma omp parallel for collapse(2) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - input1[i] / input0[i + d * common_size]; - } - } - } - break; - case MIN: -#pragma omp parallel for collapse(2) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - std::min(input0[i + d * common_size], input1[i]); - } - } - break; - case MAX: -#pragma omp parallel for collapse(2) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - std::max(input0[i + d * common_size], input1[i]); - } - } - break; - case SQR_DIFF: -#pragma omp parallel for collapse(2) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - std::pow(input0[i + d * common_size] - input1[i], 2.f); - } - } - break; - case POW: - if (!swapped) { -#pragma omp parallel for collapse(2) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - std::pow(input0[i + d * common_size], input1[i]); - } - } - } else { -#pragma omp parallel for collapse(2) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - std::pow(input1[i], input0[i + d * common_size]); - } - } - } - break; - case NEG: -#pragma omp parallel for - for (index_t i = 0; i < diff_size * common_size; ++i) { - output[i] = -input0[i]; - } - break; - case ABS: -#pragma omp parallel for - for (index_t i = 0; i < diff_size * common_size; ++i) { - output[i] = std::fabs(input0[i]); - } - break; - case EQUAL: -#pragma omp parallel for collapse(2) - for (index_t d = 0; d < diff_size; ++d) { - for (index_t i = 0; i < common_size; ++i) { - output[i + d * common_size] = - input0[i + d * common_size] == input1[i]; - } 
- } - break; - default: - LOG(FATAL) << "Eltwise op not support type " << type; - } -} - -// Multiplication is costly, so we specialize the following case. -template -inline void TensorEltwise(const EltwiseType type, - const T *input0, - const T *input1, - const std::vector &coeff, - const index_t size, - const bool swapped, - DstType *output) { - switch (type) { - case SUM: - if (coeff.empty()) { -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] + input1[i]; - } - - } else { - std::vector coeff_copy = coeff; - if (swapped) { - std::swap(coeff_copy[0], coeff_copy[1]); - } -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] * coeff_copy[0] + input1[i] * coeff_copy[1]; - } - } - break; - case SUB: - if (!swapped) { -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] - input1[i]; - } - - } else { -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = input1[i] - input0[i]; - } - } - break; - case PROD: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] * input1[i]; - } - - break; - case DIV: - if (!swapped) { -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] / input1[i]; - } - - } else { -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = input1[i] / input0[i]; - } - } - break; - case MIN: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = std::min(input0[i], input1[i]); - } - - break; - case MAX: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = std::max(input0[i], input1[i]); - } - - break; - case SQR_DIFF: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = std::pow(input0[i] - input1[i], 2.f); - } - - break; - case POW: - if (!swapped) { -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = std::pow(input0[i], 
input1[i]); - } - } else { - for (index_t i = 0; i < size; ++i) { - output[i] = std::pow(input1[i], input0[i]); - } - } - break; - case NEG: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = -input0[i]; - } - break; - case ABS: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = std::fabs(input0[i]); - } - break; - case EQUAL: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] == input1[i]; - } - break; - default: - LOG(FATAL) << "Eltwise op not support type " << type; - } -} - -// Multiplication is costly, so we specialize the following case. -template -inline void TensorScalarEltwise(const EltwiseType type, - const T *input0, - const T input1, - const std::vector &coeff, - const index_t size, - const bool swapped, - DstType *output) { - switch (type) { - case SUM: - if (coeff.empty()) { -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] + input1; - } - - } else { - std::vector coeff_copy = coeff; - if (swapped) { - std::swap(coeff_copy[0], coeff_copy[1]); - } -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] * coeff_copy[0] + input1 * coeff_copy[1]; - } - } - break; - case SUB: - if (!swapped) { -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] - input1; - } - - } else { -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = input1 - input0[i]; - } - } - break; - case PROD: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] * input1; - } - - break; - case DIV: - if (!swapped) { -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] / input1; - } - - } else { -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = input1 / input0[i]; - } - } - break; - case MIN: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = 
std::min(input0[i], input1); - } - - break; - case MAX: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = std::max(input0[i], input1); - } - - break; - case SQR_DIFF: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = std::pow(input0[i] - input1, 2.f); - } - - break; - case POW: - if (!swapped) { -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = std::pow(input0[i], input1); - } - } else { - for (index_t i = 0; i < size; ++i) { - output[i] = std::pow(input1, input0[i]); - } - } - break; - case NEG: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = -input0[i]; - } - break; - case ABS: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = std::fabs(input0[i]); - } - break; - case EQUAL: -#pragma omp parallel for - for (index_t i = 0; i < size; ++i) { - output[i] = input0[i] == input1; - } - - break; - default: - LOG(FATAL) << "Eltwise op not support type " << type; - } -} - -template -inline void TensorEltwisePerChannel(const EltwiseType type, - const T *input0, - const T *input1, - const std::vector &coeff, - const index_t batch0, - const index_t batch1, - const index_t channel, - const index_t image_size, - const bool swapped, - DstType *output) { - switch (type) { - case SUM: - if (coeff.empty()) { -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? 
b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = in0_ptr[i] + in1_ptr[c]; - } - } - } - } else { - std::vector coeff_copy = coeff; - if (swapped) { - std::swap(coeff_copy[0], coeff_copy[1]); - } -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = - in0_ptr[i] * coeff_copy[0] + in1_ptr[c] * coeff_copy[1]; - } - } - } - } - break; - case SUB: - if (!swapped) { -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = in0_ptr[i] - in1_ptr[c]; - } - } - } - } else { -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = in1_ptr[c] - in0_ptr[i]; - } - } - } - } - break; - case PROD: -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? 
b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = in0_ptr[i] * in1_ptr[c]; - } - } - } - break; - case DIV: - if (!swapped) { -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = in0_ptr[i] / in1_ptr[c]; - } - } - } - } else { -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = in1_ptr[c] / in0_ptr[i]; - } - } - } - } - break; - case MIN: -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = std::min(in0_ptr[i], in1_ptr[c]); - } - } - } - break; - case MAX: -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? 
b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = std::max(in0_ptr[i], in1_ptr[c]); - } - } - } - break; - case SQR_DIFF: -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = std::pow(in0_ptr[i] - in1_ptr[c], 2.f); - } - } - } - break; - case POW: - if (!swapped) { -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = std::pow(in0_ptr[i], in1_ptr[c]); - } - } - } - } else { -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? 
b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = std::pow(in1_ptr[c], in0_ptr[i]); - } - } - } - } - break; - case NEG: -#pragma omp parallel for - for (index_t i = 0; i < batch0 * channel * image_size; ++i) { - output[i] = -input0[i]; - } - break; - case ABS: -#pragma omp parallel for - for (index_t i = 0; i < batch0 * channel * image_size; ++i) { - output[i] = std::fabs(input0[i]); - } - break; - case EQUAL: -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch0; ++b) { - for (index_t c = 0; c < channel; ++c) { - const T *in0_ptr = input0 + ((b * channel) + c) * image_size; - const T *in1_ptr = input1 + (batch1 > 1 ? b * channel : 0); - DstType *out_ptr = output + ((b * channel) + c) * image_size; - for (index_t i = 0; i < image_size; ++i) { - out_ptr[i] = in0_ptr[i] == in1_ptr[c]; - } - } - } - break; - default: - LOG(FATAL) << "Eltwise op not support type " << type; - } -} - -template -struct EltwiseFunctor : OpKernel { - EltwiseFunctor(OpKernelContext *context, - const EltwiseType type, - const std::vector &coeff, - const float scalar_input, // float as it comes from arg - const int32_t scalar_input_index, - const DataFormat data_format) - : OpKernel(context), - type_(type), - coeff_(coeff), - scalar_input_(scalar_input), - scalar_input_index_(scalar_input_index), - data_format_(data_format) {} - - template - MaceStatus DoEltwise(const Tensor *input0, - const Tensor *input1, - Tensor *output) { - bool swapped = false; - if (input0->size() < input1->size()) { - std::swap(input0, input1); - swapped = true; - } - if (scalar_input_index_ == 0) { - swapped = !swapped; - } - - // check if we can broadcast tensor - uint32_t rank_diff = - static_cast(input0->dim_size() - input1->dim_size()); - if (data_format_ == NCHW) { - MACE_CHECK( - (input0->dim_size() == 4) && - ((input1->dim_size() == 0) || - (input1->dim_size() == 4 && input1->dim(1) == input0->dim(1) && 
- (input1->dim(0) == input0->dim(0) || input1->dim(0) == 1)) || - (input1->dim_size() == 1 && input1->dim(0) == input0->dim(1))), - "only support broadcast channel dimension"); - } else { - for (uint32_t i = 0; i < input1->dim_size(); ++i) { - MACE_CHECK(input0->dim(rank_diff + i) == 1 || input1->dim(i) == 1 || - input0->dim(rank_diff + i) == input1->dim(i), - "Element-Wise op only support tail dimensions broadcast"); - } - } - - Tensor::MappingGuard input0_guard(input0); - Tensor::MappingGuard input1_guard(input1); - - const T *input0_ptr = input0->data(); - const T *input1_ptr = input1->data(); - - if (data_format_ == NCHW && input1->dim_size() > 0 && - input1->size() < input0->size()) { - MACE_RETURN_IF_ERROR(output->ResizeLike(input0)); - Tensor::MappingGuard output_guard(output); - DstType *output_ptr = output->mutable_data(); - TensorEltwisePerChannel( - type_, input0_ptr, input1_ptr, coeff_, input0->dim(0), - input1->dim_size() == 1 ? 1 : input1->dim(0), input0->dim(1), - input0->dim(2) * input0->dim(3), swapped, output_ptr); - - } else { - const std::vector &input0_shape = input0->shape(); - std::vector input1_shape(rank_diff, 1); - input1_shape.insert(input1_shape.end(), input1->shape().begin(), - input1->shape().end()); - - std::vector output_shape(input0->dim_size(), 0); - for (unsigned int i = 0; i < input0_shape.size(); ++i) { - output_shape[i] = std::max(input0_shape[i], input1_shape[i]); - } - MACE_RETURN_IF_ERROR(output->Resize(output_shape)); - Tensor::MappingGuard output_guard(output); - DstType *output_ptr = output->mutable_data(); - - bool need_general_broadcast = false; - for (uint32_t i = 0; i < input1->dim_size(); ++i) { - if ((input0->dim(rank_diff + i) == 1 && input1->dim(i) > 1) || - (input0->dim(rank_diff + i) > 1 && input1->dim(i) == 1)) { - need_general_broadcast = true; - break; - } - } - - if (need_general_broadcast) { - TensorGeneralBroadcastEltwise(type_, input0_ptr, input1_ptr, coeff_, - swapped, input0_shape, input1_shape, - 
output_shape, output_ptr); - } else if (input1->size() == input0->size()) { - TensorEltwise(type_, input0_ptr, input1_ptr, coeff_, input0->size(), - swapped, output_ptr); - } else if (input1->size() < input0->size()) { - if (input1->size() > 1) { - index_t common_size = input1->size(); - index_t diff_size = input0->size() / common_size; - TensorBroadcastEltwise(type_, input0_ptr, input1_ptr, coeff_, - diff_size, common_size, swapped, output_ptr); - } else { - TensorScalarEltwise(type_, input0_ptr, input1_ptr[0], coeff_, - input0->size(), swapped, output_ptr); - } - } - } - - return MACE_SUCCESS; - } - - MaceStatus operator()(const Tensor *input0, - const Tensor *input1, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); - - if (input1 == nullptr) { - scalar_tensor_.Resize({}); - Tensor::MappingGuard guard(&scalar_tensor_); - auto scalar_data = scalar_tensor_.mutable_data(); - scalar_data[0] = static_cast(scalar_input_); - input1 = &scalar_tensor_; - } - - if (IsLogicalType(type_)) { - // as we do not have bool-type tensor, we use int type - return DoEltwise(input0, input1, output); - } else { - return DoEltwise(input0, input1, output); - } - } - - EltwiseType type_; - std::vector coeff_; - float scalar_input_; - int32_t scalar_input_index_; - DataFormat data_format_; - Tensor scalar_tensor_; -}; - -template <> -struct EltwiseFunctor : OpKernel { - EltwiseFunctor(OpKernelContext *context, - const EltwiseType type, - const std::vector &coeff, - const float scalar_input, // float as it comes from arg - const int32_t scalar_input_index, - const DataFormat data_format) - : OpKernel(context), - type_(type), - coeff_(coeff), - scalar_input_(scalar_input), - scalar_input_index_(scalar_input_index), - data_format_(data_format) {} - - MaceStatus operator()(const Tensor *input0, - const Tensor *input1, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); - - MACE_CHECK(type_ == SUM, "Only support Elementwise SUM now. 
"); - MACE_CHECK(input0->size() == input1->size(), - "input0 and input1 must have the same shape."); - MACE_CHECK(output->scale() != 0); - MACE_RETURN_IF_ERROR(output->Resize(input0->shape())); - - constexpr int left_shift = 20; - const double doubled_scale = 2 * std::max(input0->scale(), input1->scale()); - const double adjusted_input0_scale = input0->scale() / doubled_scale; - const double adjusted_input1_scale = input1->scale() / doubled_scale; - const double adjusted_output_scale = - doubled_scale / ((1 << left_shift) * output->scale()); - - int32_t input0_multiplier; - int32_t input1_multiplier; - int32_t output_multiplier; - int32_t input0_shift; - int32_t input1_shift; - int32_t output_shift; - QuantizeMultiplier(adjusted_input0_scale, - &input0_multiplier, - &input0_shift); - QuantizeMultiplier(adjusted_input1_scale, - &input1_multiplier, - &input1_shift); - QuantizeMultiplier(adjusted_output_scale, - &output_multiplier, - &output_shift); - - Tensor::MappingGuard input0_guard(input0); - Tensor::MappingGuard input1_guard(input1); - Tensor::MappingGuard output_guard(output); - - auto input0_ptr = input0->data(); - auto input1_ptr = input1->data(); - auto output_ptr = output->mutable_data(); - - index_t handled_output_size = 0; -#ifdef MACE_ENABLE_NEON -#pragma omp parallel for - for (index_t i = handled_output_size; i <= output->size() - 8; i += 8) { - const auto input0_val = vld1_u8(input0_ptr + i); - const auto input1_val = vld1_u8(input1_ptr + i); - const auto input0_val_s16 = - vreinterpretq_s16_u16(vmovl_u8(input0_val)); - const auto input1_val_s16 = - vreinterpretq_s16_u16(vmovl_u8(input1_val)); - const auto offset_input0 = - vaddq_s16(input0_val_s16, vdupq_n_s16(-input0->zero_point())); - const auto offset_input1 = - vaddq_s16(input1_val_s16, vdupq_n_s16(-input1->zero_point())); - auto input0_low_s32 = vmovl_s16(vget_low_s16(offset_input0)); - auto input0_high_s32 = vmovl_s16(vget_high_s16(offset_input0)); - auto input1_low_s32 = 
vmovl_s16(vget_low_s16(offset_input1)); - auto input1_high_s32 = vmovl_s16(vget_high_s16(offset_input1)); - const auto left_shift_dup = vdupq_n_s32(left_shift); - input0_low_s32 = vshlq_s32(input0_low_s32, left_shift_dup); - input0_high_s32 = vshlq_s32(input0_high_s32, left_shift_dup); - input1_low_s32 = vshlq_s32(input1_low_s32, left_shift_dup); - input1_high_s32 = vshlq_s32(input1_high_s32, left_shift_dup); - input0_low_s32 = vqrdmulhq_n_s32(input0_low_s32, input0_multiplier); - input0_high_s32 = vqrdmulhq_n_s32(input0_high_s32, input0_multiplier); - input1_low_s32 = vqrdmulhq_n_s32(input1_low_s32, input1_multiplier); - input1_high_s32 = vqrdmulhq_n_s32(input1_high_s32, input1_multiplier); - const auto input0_shift_dup = vdupq_n_s32(input0_shift); - const auto input1_shift_dup = vdupq_n_s32(input1_shift); - input0_low_s32 = vshlq_s32(input0_low_s32, input0_shift_dup); - input0_high_s32 = vshlq_s32(input0_high_s32, input0_shift_dup); - input1_low_s32 = vshlq_s32(input1_low_s32, input1_shift_dup); - input1_high_s32 = vshlq_s32(input1_high_s32, input1_shift_dup); - auto sum_low = vaddq_s32(input0_low_s32, input1_low_s32); - auto sum_high = vaddq_s32(input0_high_s32, input1_high_s32); - sum_low = vqrdmulhq_n_s32(sum_low, output_multiplier); - sum_high = vqrdmulhq_n_s32(sum_high, output_multiplier); - sum_low = gemmlowp::RoundingDivideByPOT(sum_low, -output_shift); - sum_high = gemmlowp::RoundingDivideByPOT(sum_high, -output_shift); - const auto sum_low_s16 = vmovn_s32(sum_low); - const auto sum_high_s16 = vmovn_s32(sum_high); - const auto output_val = vaddq_s16(vcombine_s16(sum_low_s16, - sum_high_s16), - vdupq_n_s16(output->zero_point())); - vst1_u8(output_ptr + i, vqmovun_s16(output_val)); - } - handled_output_size = output->size() - output->size() % 8; -#endif // NEON -#pragma omp parallel for - for (index_t i = handled_output_size; i < output->size(); ++i) { - const int32_t offset_input0 = input0_ptr[i] - input0->zero_point(); - const int32_t offset_input1 = 
input1_ptr[i] - input1->zero_point(); - const int32_t shifted_input0 = offset_input0 * (1 << left_shift); - const int32_t shifted_input1 = offset_input1 * (1 << left_shift); - const int32_t multiplied_input0 = - gemmlowp::RoundingDivideByPOT( - gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input0, - input0_multiplier), - -input0_shift); - const int32_t multiplied_input1 = - gemmlowp::RoundingDivideByPOT( - gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input1, - input1_multiplier), - -input1_shift); - const int32_t sum = multiplied_input0 + multiplied_input1; - const int32_t output_val = - gemmlowp::RoundingDivideByPOT( - gemmlowp::SaturatingRoundingDoublingHighMul(sum, - output_multiplier), - -output_shift) + output->zero_point(); - output_ptr[i] = Saturate(output_val); - } - - return MACE_SUCCESS; - } - - EltwiseType type_; - std::vector coeff_; - float scalar_input_; - int32_t scalar_input_index_; - DataFormat data_format_; - Tensor scalar_tensor_; -}; - -#ifdef MACE_ENABLE_OPENCL -class OpenCLEltwiseKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input0, - const Tensor *input1, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLEltwiseKernel); -}; -template -struct EltwiseFunctor : OpKernel { - EltwiseFunctor(OpKernelContext *context, - const EltwiseType type, - const std::vector &coeff, - const float scalar_input, - const int32_t scalar_input_index, - const DataFormat data_format); - - MaceStatus operator()(const Tensor *input0, - const Tensor *input1, - Tensor *output, - StatsFuture *future); - - std::unique_ptr kernel_; -}; -#endif // MACE_ENABLE_OPENCL +inline bool IsLogicalType(EltwiseType type) { return type == EQUAL; } } // namespace kernels } // namespace mace diff --git a/mace/kernels/expand_dims.h b/mace/kernels/expand_dims.cc similarity index 62% rename from mace/kernels/expand_dims.h rename to mace/kernels/expand_dims.cc index 05cac125..5dc58436 100644 --- 
a/mace/kernels/expand_dims.h +++ b/mace/kernels/expand_dims.cc @@ -12,35 +12,26 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_EXPAND_DIMS_H_ -#define MACE_KERNELS_EXPAND_DIMS_H_ -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" - -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#endif // MACE_ENABLE_OPENCL +#include "mace/core/operator.h" namespace mace { namespace kernels { -template -struct ExpandDimsFunctor; +template +class ExpandDimsOp; template -struct ExpandDimsFunctor : OpKernel { - explicit ExpandDimsFunctor(OpKernelContext *context, int axis) - : OpKernel(context), axis_(axis) {} - - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); +class ExpandDimsOp : public Operation { + public: + explicit ExpandDimsOp(OpConstructContext *context) + : Operation(context), + axis_(Operation::GetOptionalArg("axis", 0)) {} + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); index_t input_dims_size = input->dim_size(); if ( axis_ < 0 ) { axis_ += input_dims_size + 1; @@ -58,13 +49,23 @@ struct ExpandDimsFunctor : OpKernel { output->ReuseTensorBuffer(*input); output->Reshape(output_shape); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } + private: int axis_; }; +void RegisterExpandDims(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "ExpandDims", ExpandDimsOp, + DeviceType::CPU, float); + + MACE_REGISTER_OP(op_registry, "ExpandDims", ExpandDimsOp, + DeviceType::CPU, int32_t); + + MACE_REGISTER_OP(op_registry, "ExpandDims", ExpandDimsOp, + DeviceType::CPU, uint8_t); +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_EXPAND_DIMS_H_ diff --git a/mace/kernels/fill.h b/mace/kernels/fill.cc similarity index 69% 
rename from mace/kernels/fill.h rename to mace/kernels/fill.cc index 131dd9d4..0cd20930 100644 --- a/mace/kernels/fill.h +++ b/mace/kernels/fill.cc @@ -12,34 +12,26 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_FILL_H_ -#define MACE_KERNELS_FILL_H_ -#include -#include -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" -#include "mace/public/mace.h" +#include "mace/core/operator.h" namespace mace { namespace kernels { template -struct FillFunctor; +class FillOp; template <> -struct FillFunctor : OpKernel { - explicit FillFunctor(OpKernelContext *context) : OpKernel(context) {} - - MaceStatus operator()(const Tensor *shape, - const Tensor *value, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); +class FillOp : public Operation { + public: + explicit FillOp(OpConstructContext *context) + : Operation(context) {} + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *shape = this->Input(SHAPE); + const Tensor *value = this->Input(VALUE); + Tensor *output = this->Output(OUTPUT); MACE_CHECK(shape->dim_size() == 1, "Shape must be 1-D"); const index_t num_dims = shape->dim(0); Tensor::MappingGuard shape_guard(shape); @@ -61,11 +53,18 @@ struct FillFunctor : OpKernel { std::fill(output_data, output_data + output->size(), *value_data); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } + + private: + MACE_OP_INPUT_TAGS(SHAPE, VALUE); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; +void RegisterFill(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Fill", FillOp, + DeviceType::CPU, float); +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_FILL_H_ diff --git a/mace/kernels/fully_connected.cc b/mace/kernels/fully_connected.cc new file mode 100644 index 00000000..a7b74c69 --- /dev/null +++ b/mace/kernels/fully_connected.cc @@ -0,0 +1,233 @@ +// Copyright 2018 
Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "mace/core/future.h" +#include "mace/core/operator.h" +#include "mace/core/tensor.h" +#include "mace/kernels/activation.h" +#include "mace/kernels/gemm.h" +#include "mace/kernels/gemmlowp_util.h" + +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/fully_connected.h" +#endif // MACE_ENABLE_OPENCL + +namespace mace { +namespace kernels { + +class FullyConnectedOpBase : public Operation { + public: + explicit FullyConnectedOpBase(OpConstructContext *context) + : Operation(context), + activation_(kernels::StringToActivationType( + Operation::GetOptionalArg("activation", + "NOOP"))), + relux_max_limit_(Operation::GetOptionalArg("max_limit", 0.0f)) {} + protected: + const ActivationType activation_; + const float relux_max_limit_; + + MACE_OP_INPUT_TAGS(INPUT, WEIGHT, BIAS); + MACE_OP_OUTPUT_TAGS(OUTPUT); +}; + +template +class FullyConnectedOp; + +template <> +class FullyConnectedOp : public FullyConnectedOpBase { + public: + explicit FullyConnectedOp(OpConstructContext *context) + : FullyConnectedOpBase(context) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(INPUT); + const Tensor *weight = this->Input(WEIGHT); // OIHW + const Tensor *bias = this->InputSize() >= 3 ? 
this->Input(BIAS) : nullptr; + Tensor *output = this->Output(OUTPUT); + + MACE_CHECK( + input->dim(1) == weight->dim(1) && input->dim(2) == weight->dim(2) && + input->dim(3) == weight->dim(3), + "The shape of Input: ", MakeString(input->shape()), + "The shape of Weight: ", MakeString(weight->shape()), + " don't match."); + if (bias) { + MACE_CHECK(weight->dim(0) == bias->dim(0), + "The shape of Weight: ", MakeString(weight->shape()), + " and shape of Bias: ", bias->dim(0), + " don't match."); + } + std::vector output_shape = {input->dim(0), weight->dim(0), 1, 1}; + MACE_RETURN_IF_ERROR(output->Resize(output_shape)); + const index_t N = output->dim(0); + const index_t input_size = weight->dim(1) * weight->dim(2) * weight->dim(3); + const index_t output_size = weight->dim(0); + + Tensor::MappingGuard guard_input(input); + Tensor::MappingGuard guard_weight(weight); + Tensor::MappingGuard guard_output(output); + const float *input_ptr = input->data(); + const float *weight_ptr = weight->data(); + float *output_ptr = output->mutable_data(); + + Gemv(weight_ptr, input_ptr, N, input_size, output_size, output_ptr); + + if (bias) { + Tensor::MappingGuard guard_bias(bias); + const float *bias_ptr = bias == nullptr ? nullptr : bias->data(); + for (int i = 0; i < N; ++i) { + for (int j = 0; j < output_size; ++j) { + output_ptr[j + i * output_size] += bias_ptr[j]; + } + } + } + + DoActivation(output_ptr, output_ptr, output->size(), activation_, + relux_max_limit_); + + return MaceStatus::MACE_SUCCESS; + } +}; + +template <> +class FullyConnectedOp + : public FullyConnectedOpBase { + public: + explicit FullyConnectedOp(OpConstructContext *context) + : FullyConnectedOpBase(context) {} + + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(INPUT); + const Tensor *weight = this->Input(WEIGHT); // OIHW + const Tensor *bias = this->InputSize() >= 3 ? 
this->Input(BIAS) : nullptr; + Tensor *output = this->Output(OUTPUT); + + MACE_CHECK( + input->dim(1) == weight->dim(1) && input->dim(2) == weight->dim(2) && + input->dim(3) == weight->dim(3), + "The shape of Input: ", MakeString(input->shape()), + "The shape of Weight: ", MakeString(weight->shape()), + " don't match."); + if (bias) { + MACE_CHECK(weight->dim(0) == bias->dim(0), + "The shape of Weight: ", MakeString(weight->shape()), + " and shape of Bias: ", bias->dim(0), + " don't match."); + } + auto gemm_context = context->device()->cpu_runtime()->GetGemmlowpContext(); + MACE_CHECK_NOTNULL(gemm_context); + + std::vector output_shape = {input->dim(0), 1, 1, weight->dim(0)}; + MACE_RETURN_IF_ERROR(output->Resize(output_shape)); + const int N = static_cast(output->dim(0)); + const int input_size = + static_cast(weight->dim(1) * weight->dim(2) * weight->dim(3)); + const int output_size = static_cast(weight->dim(0)); + + Tensor::MappingGuard guard_input(input); + Tensor::MappingGuard guard_weight(weight); + Tensor::MappingGuard guard_output(output); + auto input_ptr = input->data(); + auto weight_ptr = weight->data(); + auto output_ptr = output->mutable_data(); + + std::vector bias_shape{output_size}; + std::unique_ptr zero_bias; + const int32_t *bias_ptr = nullptr; + if (bias == nullptr) { + zero_bias.reset( + new Tensor(GetCPUAllocator(), DT_INT32)); + zero_bias->Resize(bias_shape); + zero_bias->Clear(); + bias_ptr = zero_bias->data(); + } else { + bias_ptr = bias->data(); + } + + gemmlowp::MatrixMap + weight_matrix(weight_ptr, output_size, input_size); + gemmlowp::MatrixMap + input_matrix(input_ptr, input_size, N); + gemmlowp::MatrixMap + output_matrix(output_ptr, output_size, N); + + const auto &output_pipeline = GemmlowpOutputPipeline::Make( + bias_ptr, output_size, weight->scale(), input->scale(), output->scale(), + output->zero_point()); + + using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams; + gemmlowp::GemmWithOutputPipeline( + gemm_context, 
weight_matrix, input_matrix, &output_matrix, + -weight->zero_point(), -input->zero_point(), output_pipeline); + + return MaceStatus::MACE_SUCCESS; + } +}; + +#ifdef MACE_ENABLE_OPENCL +template +class FullyConnectedOp : public FullyConnectedOpBase { + public: + explicit FullyConnectedOp(OpConstructContext *context) + : FullyConnectedOpBase(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::FullyConnectedKernel); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(INPUT); + const Tensor *weight = this->Input(WEIGHT); // OIHW + const Tensor *bias = this->InputSize() >= 3 ? this->Input(BIAS) : nullptr; + Tensor *output = this->Output(OUTPUT); + + MACE_CHECK( + input->dim(1) == weight->dim(2) && input->dim(2) == weight->dim(3) && + input->dim(3) == weight->dim(1), + "The shape of Input: ", MakeString(input->shape()), + "The shape of Weight: ", MakeString(weight->shape()), + " don't match."); + return kernel_->Compute( + context, input, weight, bias, activation_, relux_max_limit_, output); + } + + private: + std::unique_ptr kernel_; +}; +#endif // MACE_ENABLE_OPENCL + +void RegisterFullyConnected(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "FullyConnected", + FullyConnectedOp, DeviceType::CPU, float); + + MACE_REGISTER_OP(op_registry, "FullyConnected", + FullyConnectedOp, DeviceType::CPU, uint8_t); +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "FullyConnected", + FullyConnectedOp, DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "FullyConnected", + FullyConnectedOp, DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/fully_connected.h b/mace/kernels/fully_connected.h deleted file mode 100644 index 20a572cb..00000000 --- a/mace/kernels/fully_connected.h +++ /dev/null @@ -1,186 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. 
All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_KERNELS_FULLY_CONNECTED_H_ -#define MACE_KERNELS_FULLY_CONNECTED_H_ - -#include -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/activation.h" -#include "mace/kernels/gemm.h" -#include "mace/kernels/gemmlowp_util.h" - -namespace mace { -namespace kernels { - -struct FullyConnectedBase : OpKernel { - FullyConnectedBase(OpKernelContext *context, - const ActivationType activation, - const float relux_max_limit) - : OpKernel(context), - activation_(activation), - relux_max_limit_(relux_max_limit) {} - - const ActivationType activation_; - const float relux_max_limit_; -}; - -template -struct FullyConnectedFunctor; - -template <> -struct FullyConnectedFunctor: FullyConnectedBase { - FullyConnectedFunctor(OpKernelContext *context, - const ActivationType activation, - const float relux_max_limit) - : FullyConnectedBase(context, activation, relux_max_limit) {} - - MaceStatus operator()(const Tensor *input, - const Tensor *weight, - const Tensor *bias, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); - std::vector output_shape = {input->dim(0), weight->dim(0), 1, 1}; - MACE_RETURN_IF_ERROR(output->Resize(output_shape)); - const index_t N = output->dim(0); - const index_t input_size = weight->dim(1) * weight->dim(2) * weight->dim(3); - const index_t output_size = weight->dim(0); - - Tensor::MappingGuard 
guard_input(input); - Tensor::MappingGuard guard_weight(weight); - Tensor::MappingGuard guard_output(output); - const float *input_ptr = input->data(); - const float *weight_ptr = weight->data(); - float *output_ptr = output->mutable_data(); - - Gemv(weight_ptr, input_ptr, N, input_size, output_size, output_ptr); - - if (bias) { - Tensor::MappingGuard guard_bias(bias); - const float *bias_ptr = bias == nullptr ? nullptr : bias->data(); - for (int i = 0; i < N; ++i) { - for (int j = 0; j < output_size; ++j) { - output_ptr[j + i * output_size] += bias_ptr[j]; - } - } - } - - DoActivation(output_ptr, output_ptr, output->size(), activation_, - relux_max_limit_); - - return MACE_SUCCESS; - } -}; - -template <> -struct FullyConnectedFunctor: FullyConnectedBase { - FullyConnectedFunctor(OpKernelContext *context, - const ActivationType activation, - const float relux_max_limit) - : FullyConnectedBase(context, activation, relux_max_limit) {} - - MaceStatus operator()(const Tensor *input, - const Tensor *weight, - const Tensor *bias, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); - auto gemm_context = context_->device()->cpu_runtime()->GetGemmlowpContext(); - MACE_CHECK_NOTNULL(gemm_context); - - std::vector output_shape = {input->dim(0), 1, 1, weight->dim(0)}; - MACE_RETURN_IF_ERROR(output->Resize(output_shape)); - const int N = static_cast(output->dim(0)); - const int input_size = - static_cast(weight->dim(1) * weight->dim(2) * weight->dim(3)); - const int output_size = static_cast(weight->dim(0)); - - Tensor::MappingGuard guard_input(input); - Tensor::MappingGuard guard_weight(weight); - Tensor::MappingGuard guard_output(output); - auto input_ptr = input->data(); - auto weight_ptr = weight->data(); - auto output_ptr = output->mutable_data(); - - std::vector bias_shape{output_size}; - std::unique_ptr zero_bias; - const int32_t *bias_ptr = nullptr; - if (bias == nullptr) { - zero_bias.reset( - new Tensor(GetCPUAllocator(), DT_INT32)); - 
zero_bias->Resize(bias_shape); - zero_bias->Clear(); - bias_ptr = zero_bias->data(); - } else { - bias_ptr = bias->data(); - } - - gemmlowp::MatrixMap - weight_matrix(weight_ptr, output_size, input_size); - gemmlowp::MatrixMap - input_matrix(input_ptr, input_size, N); - gemmlowp::MatrixMap - output_matrix(output_ptr, output_size, N); - - const auto &output_pipeline = GemmlowpOutputPipeline::Make( - bias_ptr, output_size, weight->scale(), input->scale(), output->scale(), - output->zero_point()); - - using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams; - gemmlowp::GemmWithOutputPipeline( - gemm_context, weight_matrix, input_matrix, &output_matrix, - -weight->zero_point(), -input->zero_point(), output_pipeline); - - return MACE_SUCCESS; - } -}; - -#ifdef MACE_ENABLE_OPENCL -class OpenCLFullyConnectedKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - const Tensor *weight, - const Tensor *bias, - const ActivationType activation, - const float relux_max_limit, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLFullyConnectedKernel); -}; -template -struct FullyConnectedFunctor : FullyConnectedBase { - FullyConnectedFunctor(OpKernelContext *context, - const ActivationType activation, - const float relux_max_limit); - - MaceStatus operator()(const Tensor *input, - const Tensor *weight, - const Tensor *bias, - Tensor *output, - StatsFuture *future); - - std::unique_ptr kernel_; -}; -#endif // MACE_ENABLE_OPENCL - -} // namespace kernels -} // namespace mace - -#endif // MACE_KERNELS_FULLY_CONNECTED_H_ diff --git a/mace/kernels/gather.h b/mace/kernels/gather.cc similarity index 76% rename from mace/kernels/gather.h rename to mace/kernels/gather.cc index d8978a63..ff947e82 100644 --- a/mace/kernels/gather.h +++ b/mace/kernels/gather.cc @@ -12,43 +12,29 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_KERNELS_GATHER_H_ -#define MACE_KERNELS_GATHER_H_ - #include -#include -#include -#include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" -#include "mace/public/mace.h" +#include "mace/core/operator.h" namespace mace { namespace kernels { -struct GatherBase : OpKernel { - GatherBase(OpKernelContext *context, int axis, float y) - : OpKernel(context), axis_(axis), y_(y) {} - - int axis_; - float y_; -}; - -template -struct GatherFunctor; +template +class GatherOp; template <> -struct GatherFunctor : GatherBase { - GatherFunctor(OpKernelContext *context, int axis, float y) - : GatherBase(context, axis, y) {} - - MaceStatus operator()(const Tensor *params, - const Tensor *indices, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); +class GatherOp : public Operation { + public: + explicit GatherOp(OpConstructContext *context) + : Operation(context), + axis_(Operation::GetOptionalArg("axis", 0)), + y_(Operation::GetOptionalArg("y", 1.0)) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *params = this->Input(PARAMS); + const Tensor *indices = this->Input(INDICES); + Tensor *output = this->Output(OUTPUT); std::vector output_shape; if (axis_ < 0) { axis_ += params->dim_size(); @@ -99,11 +85,20 @@ struct GatherFunctor : GatherBase { } } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } + + private: + int axis_; + float y_; + MACE_OP_INPUT_TAGS(PARAMS, INDICES); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; +void RegisterGather(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Gather", GatherOp, + DeviceType::CPU, float); +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_GATHER_H_ diff --git a/mace/kernels/gemmlowp_util.h b/mace/kernels/gemmlowp_util.h index 86335955..8a0148e1 100644 --- a/mace/kernels/gemmlowp_util.h +++ b/mace/kernels/gemmlowp_util.h @@ -18,7 +18,8 @@ #include #include "public/gemmlowp.h" -#include 
"mace/kernels/quantize.h" +#include "mace/core/types.h" +#include "mace/utils/quantize.h" namespace mace { diff --git a/mace/kernels/identity.cc b/mace/kernels/identity.cc new file mode 100644 index 00000000..1fba94bd --- /dev/null +++ b/mace/kernels/identity.cc @@ -0,0 +1,50 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#include "mace/core/operator.h" + +namespace mace { +namespace kernels { + +template +class IdentityOp : public Operation { + public: + explicit IdentityOp(OpConstructContext *context) + : Operation(context) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + output->ReuseTensorBuffer(*input); + return MaceStatus::MACE_SUCCESS; + } +}; + +void RegisterIdentity(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Identity", IdentityOp, + DeviceType::CPU, float); + MACE_REGISTER_OP(op_registry, "Identity", IdentityOp, + DeviceType::CPU, int32_t); +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "Identity", IdentityOp, + DeviceType::GPU, float); + MACE_REGISTER_OP(op_registry, "Identity", IdentityOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + +} // namespace kernels +} // namespace mace diff --git a/mace/ops/infer_conv2d_shape.h b/mace/kernels/infer_conv2d_shape.cc similarity index 68% rename from mace/ops/infer_conv2d_shape.h rename to 
mace/kernels/infer_conv2d_shape.cc index 6d1fdf4f..0e80aa61 100644 --- a/mace/ops/infer_conv2d_shape.h +++ b/mace/kernels/infer_conv2d_shape.cc @@ -12,44 +12,41 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_INFER_CONV2D_SHAPE_H_ -#define MACE_OPS_INFER_CONV2D_SHAPE_H_ - -#include #include "mace/core/operator.h" #include "mace/kernels/conv_pool_2d_util.h" namespace mace { -namespace ops { +namespace kernels { -template -class InferConv2dShapeOp : public Operator { +template +class InferConv2dShapeOp : public Operation { public: - InferConv2dShapeOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context) {} + explicit InferConv2dShapeOp(OpConstructContext *context) + : Operation(context) {} - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - Tensor *output = this->Output(OUTPUT); + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); MACE_CHECK(input->dim_size() == 4); output->Resize({input->dim_size()}); Tensor::MappingGuard output_guard(output); int32_t *output_data = output->mutable_data(); const int32_t data_format = - OperatorBase::GetOptionalArg("data_format", 0); + Operation::GetOptionalArg("data_format", 0); const bool isNCHW = data_format == 1; Padding padding_type = - static_cast(OperatorBase::GetOptionalArg( + static_cast(Operation::GetOptionalArg( "padding", static_cast(SAME))); const std::vector paddings = - OperatorBase::GetRepeatedArgs("padding_values"); + Operation::GetRepeatedArgs("padding_values"); const std::vector kernels = - OperatorBase::GetRepeatedArgs("kernels"); + Operation::GetRepeatedArgs("kernels"); const std::vector strides = - OperatorBase::GetRepeatedArgs("strides", {1, 1}); + Operation::GetRepeatedArgs("strides", {1, 1}); const int32_t out_batch = static_cast(input->dim(0)); const int32_t 
out_channel = static_cast(kernels[0]); @@ -97,17 +94,22 @@ class InferConv2dShapeOp : public Operator { output_data[3] = out_channel; } - SetFutureDefaultWaitFn(future); - - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } - - private: - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); }; -} // namespace ops -} // namespace mace +void RegisterInferConv2dShape(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "InferConv2dShape", + InferConv2dShapeOp, DeviceType::CPU, float); + MACE_REGISTER_OP(op_registry, "InferConv2dShape", + InferConv2dShapeOp, DeviceType::CPU, int32_t); +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "InferConv2dShape", + InferConv2dShapeOp, DeviceType::GPU, float); + MACE_REGISTER_OP(op_registry, "InferConv2dShape", + InferConv2dShapeOp, DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} -#endif // MACE_OPS_INFER_CONV2D_SHAPE_H_ +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/local_response_norm.h b/mace/kernels/local_response_norm.cc similarity index 56% rename from mace/kernels/local_response_norm.h rename to mace/kernels/local_response_norm.cc index d53b8f08..6a51ccb3 100644 --- a/mace/kernels/local_response_norm.h +++ b/mace/kernels/local_response_norm.cc @@ -12,40 +12,36 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_KERNELS_LOCAL_RESPONSE_NORM_H_ -#define MACE_KERNELS_LOCAL_RESPONSE_NORM_H_ - #include -#include -#include -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#endif // MACE_ENABLE_OPENCL +#include "mace/core/operator.h" namespace mace { namespace kernels { -template -struct LocalResponseNormFunctor; - -template<> -struct LocalResponseNormFunctor : OpKernel { - explicit LocalResponseNormFunctor(OpKernelContext *context) - : OpKernel(context) {} - MaceStatus operator()(const Tensor *input, - int depth_radius, - float bias, - float alpha, - float beta, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); +template +class LocalResponseNormOp; + +template <> +class LocalResponseNormOp : public Operation { + public: + explicit LocalResponseNormOp(OpConstructContext *context) + : Operation(context), + depth_radius_(Operation::GetOptionalArg("depth_radius", 5)), + bias_(Operation::GetOptionalArg("bias", 1.0f)), + alpha_(Operation::GetOptionalArg("alpha", 1.0f)), + beta_(Operation::GetOptionalArg("beta", 0.5f)) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + + MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional. 
", + input->dim_size()); + + Tensor *output = this->Output(0); + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + const index_t batch = input->dim(0); const index_t channels = input->dim(1); const index_t height = input->dim(2); @@ -61,8 +57,8 @@ struct LocalResponseNormFunctor : OpKernel { for (index_t b = 0; b < batch; ++b) { for (index_t c = 0; c < channels; ++c) { const int begin_input_c = std::max(static_cast(0), - c - depth_radius); - const int end_input_c = std::min(channels, c + depth_radius + 1); + c - depth_radius_); + const int end_input_c = std::min(channels, c + depth_radius_ + 1); index_t pos = b * batch_size; for (index_t hw = 0; hw < height * width; ++hw, ++pos) { @@ -71,18 +67,27 @@ struct LocalResponseNormFunctor : OpKernel { const float input_val = input_ptr[pos + input_c * image_size]; accum += input_val * input_val; } - const float multiplier = std::pow(bias + alpha * accum, -beta); + const float multiplier = std::pow(bias_ + alpha_ * accum, -beta_); output_ptr[pos + c * image_size] = - input_ptr[pos + c * image_size] * multiplier; + input_ptr[pos + c * image_size] * multiplier; } } } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } + + private: + int depth_radius_; + float bias_; + float alpha_; + float beta_; }; +void RegisterLocalResponseNorm(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "LocalResponseNorm", + LocalResponseNormOp, DeviceType::CPU, float); +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_LOCAL_RESPONSE_NORM_H_ diff --git a/mace/ops/lstmcell.h b/mace/kernels/lstm_cell.cc similarity index 50% rename from mace/ops/lstmcell.h rename to mace/kernels/lstm_cell.cc index 3037c891..be7f50d9 100644 --- a/mace/ops/lstmcell.h +++ b/mace/kernels/lstm_cell.cc @@ -12,28 +12,34 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_OPS_LSTMCELL_H_ -#define MACE_OPS_LSTMCELL_H_ - -#include +#include +#include #include "mace/core/operator.h" -#include "mace/kernels/lstmcell.h" +#include "mace/kernels/opencl/image/lstm_cell.h" namespace mace { -namespace ops { +namespace kernels { template -class LSTMCellOp : public Operator { +class LSTMCellOp; + +template +class LSTMCellOp : public Operation { public: - LSTMCellOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), - functor_(context, - static_cast( - OperatorBase::GetOptionalArg("scalar_input", - 0.0))) {} - - MaceStatus Run(StatsFuture *future) override { + explicit LSTMCellOp(OpConstructContext *context) + : Operation(context) { + T forget_bias = static_cast( + Operation::GetOptionalArg("scalar_input", + 0.0)); + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::LSTMCellKernel(forget_bias)); + } else { + MACE_NOT_IMPLEMENTED; + } + } + + MaceStatus Run(OpContext *context) override { const Tensor *input = this->Input(INPUT); const Tensor *pre_output = this->Input(PRE_OUTPUT); const Tensor *weight = this->Input(WEIGHT); @@ -41,19 +47,24 @@ class LSTMCellOp : public Operator { const Tensor *pre_cell = this->Input(PRE_CELL); Tensor *cell = this->Output(CELL); Tensor *output = this->Output(OUTPUT); + return kernel_->Compute(context, input, pre_output, weight, bias, + pre_cell, cell, output); + } - return functor_( - input, pre_output, weight, bias, pre_cell, cell, output, future); - }; - - protected: - kernels::LSTMCellFunctor functor_; + private: + std::unique_ptr kernel_; MACE_OP_INPUT_TAGS(INPUT, PRE_OUTPUT, WEIGHT, BIAS, PRE_CELL); MACE_OP_OUTPUT_TAGS(CELL, OUTPUT); }; -} // namespace ops -} // namespace mace +void RegisterLSTMCell(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "LSTMCell", LSTMCellOp, + DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "LSTMCell", LSTMCellOp, + DeviceType::GPU, half); +} -#endif // 
MACE_OPS_LSTMCELL_H_ +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/lstmcell.h b/mace/kernels/lstmcell.h deleted file mode 100644 index 81a7f386..00000000 --- a/mace/kernels/lstmcell.h +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_KERNELS_LSTMCELL_H_ -#define MACE_KERNELS_LSTMCELL_H_ - -#include -#include -#include -#include - -#include "mace/core/future.h" -#include "mace/core/runtime/opencl/cl2_header.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" - -#if defined(MACE_ENABLE_NEON) -#include -#endif - -namespace mace { -namespace kernels { - -template -struct LSTMCellFunctor; - -class OpenCLLSTMCellKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - const Tensor *pre_output, - const Tensor *weight, - const Tensor *bias, - const Tensor *pre_cell, - Tensor *cell, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLLSTMCellKernel); -}; -template -struct LSTMCellFunctor : OpKernel{ - LSTMCellFunctor(OpKernelContext *context, T forget_bias); - MaceStatus operator()(const Tensor *input, - const Tensor *pre_output, - const Tensor *weight, - const Tensor *bias, - const Tensor *pre_cell, - Tensor *cell, - Tensor *output, - StatsFuture *future); - - std::unique_ptr kernel_; -}; - -} // namespace kernels -} // namespace mace - -#endif // 
MACE_KERNELS_LSTMCELL_H_ diff --git a/mace/kernels/matmul.h b/mace/kernels/matmul.cc similarity index 57% rename from mace/kernels/matmul.h rename to mace/kernels/matmul.cc index 5dab02c5..4723e655 100644 --- a/mace/kernels/matmul.h +++ b/mace/kernels/matmul.cc @@ -12,13 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_MATMUL_H_ -#define MACE_KERNELS_MATMUL_H_ - -#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) -#include -#endif - #include #include #include @@ -26,27 +19,65 @@ #include #include -#include "mace/core/future.h" +#include "mace/core/operator.h" #include "mace/core/tensor.h" #include "mace/kernels/gemm.h" -#include "mace/kernels/kernel.h" -#include "mace/utils/utils.h" #include "mace/kernels/gemmlowp_util.h" #include "mace/kernels/sgemm.h" +#include "mace/utils/utils.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/matmul.h" +#endif // MACE_ENABLE_OPENCL namespace mace { namespace kernels { -template -struct MatMulFunctor : OpKernel { - explicit MatMulFunctor(OpKernelContext *context) : OpKernel(context) {} - MaceStatus operator()(const Tensor *A, - const Tensor *B, - Tensor *C, - bool transpose_a, - bool transpose_b, - StatsFuture *future) { - MACE_UNUSED(future); +class MatMulOpBase : public Operation { + public: + explicit MatMulOpBase(OpConstructContext *context) + : Operation(context), + transpose_a_(Operation::GetOptionalArg("transpose_a", false)), + transpose_b_(Operation::GetOptionalArg("transpose_b", false)) {} + + inline void Validate() { + const Tensor *A = this->Input(INPUT_A); + const Tensor *B = this->Input(INPUT_B); + MACE_CHECK(A->dim_size() == B->dim_size() && A->dim_size() >= 2, + "rank(A) should be equal to rank(B), rank should be greater " + "than or equal to 2"); + index_t rank = A->dim_size(); + for (index_t i = 0; i < rank - 2; ++i) { + MACE_CHECK(A->dim(i) == B->dim(i), + "batch dimensions are not equal: ", + A->dim(i), + 
" vs. ", + B->dim(i)); + } + index_t ak = transpose_a_ ? A->dim(rank - 2) : A->dim(rank - 1); + index_t bk = transpose_b_ ? B->dim(rank - 1) : B->dim(rank - 2); + MACE_CHECK(ak == bk, "the number of A's column ", ak, + " must be equal to B's row ", bk); + } + + protected: + MACE_OP_INPUT_TAGS(INPUT_A, INPUT_B); + MACE_OP_OUTPUT_TAGS(OUTPUT); + + bool transpose_a_; + bool transpose_b_; +}; + +template +class MatMulOp : public MatMulOpBase { + public: + explicit MatMulOp(OpConstructContext *context) + : MatMulOpBase(context) {} + + MaceStatus Run(OpContext *context) override { + Validate(); + const Tensor *A = this->Input(INPUT_A); + const Tensor *B = this->Input(INPUT_B); + Tensor *C = this->Output(OUTPUT); index_t batch; index_t height; @@ -56,10 +87,10 @@ struct MatMulFunctor : OpKernel { index_t rank = A->dim_size(); height = A->dim(rank - 2); K = A->dim(rank - 1); - if (transpose_a) { + if (transpose_a_) { std::swap(height, K); } - if (transpose_b) { + if (transpose_b_) { width = B->dim(rank - 2); } else { width = B->dim(rank - 1); @@ -85,7 +116,7 @@ struct MatMulFunctor : OpKernel { const index_t height_b = B->dim(rank - 2); const index_t width_b = B->dim(rank - 1); - auto scratch_buffer = context_->device()->scratch_buffer(); + auto scratch_buffer = context->device()->scratch_buffer(); scratch_buffer->Rewind(); index_t scratch_size = C->raw_max_size(); if (!A->is_weight()) { @@ -103,30 +134,86 @@ struct MatMulFunctor : OpKernel { width_a, height_b, width_b, - transpose_a, - transpose_b, + transpose_a_, + transpose_b_, A->is_weight(), B->is_weight(), c_ptr_base, - context_->device()->scratch_buffer()); - return MACE_SUCCESS; + context->device()->scratch_buffer()); + return MaceStatus::MACE_SUCCESS; } + private: SGemm sgemm_; }; template <> -struct MatMulFunctor : OpKernel { - explicit MatMulFunctor(OpKernelContext *context) : OpKernel(context) {} +class MatMulOp: public MatMulOpBase { + public: + explicit MatMulOp(OpConstructContext *context) + : 
MatMulOpBase(context) {} + + MaceStatus Run(OpContext *context) override { + Validate(); + const Tensor *A = this->Input(INPUT_A); + const Tensor *B = this->Input(INPUT_B); + Tensor *C = this->Output(OUTPUT); + + index_t rank = A->dim_size(); + index_t height = A->dim(rank - 2); + index_t K = A->dim(rank - 1); + index_t width; + + if (transpose_a_) { + std::swap(height, K); + } + if (transpose_b_) { + width = B->dim(rank - 2); + } else { + width = B->dim(rank - 1); + } + + std::vector c_shape = A->shape(); + c_shape[rank - 2] = height; + c_shape[rank - 1] = width; + + MACE_RETURN_IF_ERROR(C->Resize(c_shape)); + + constexpr gemmlowp::MapOrder kRowMajor = gemmlowp::MapOrder::RowMajor; + constexpr gemmlowp::MapOrder kColMajor = gemmlowp::MapOrder::ColMajor; + +#define MATMUL_IMPL(AOrder, BOrder) \ + MatMulImpl(context, A, B, height, K, width, C); + + if (transpose_a_) { + if (transpose_b_) { + MATMUL_IMPL(kColMajor, kColMajor); + } else { + MATMUL_IMPL(kColMajor, kRowMajor); + } + } else { + if (transpose_b_) { + MATMUL_IMPL(kRowMajor, kColMajor); + } else { + MATMUL_IMPL(kRowMajor, kRowMajor); + } + } + +#undef MATMUL_IMPL + + return MaceStatus::MACE_SUCCESS; + } + private: template - void MatMulImpl(const Tensor *A, + void MatMulImpl(OpContext *context, + const Tensor *A, const Tensor *B, const index_t height, const index_t K, const index_t width, Tensor *C) { - auto gemm_context = context_->device()->cpu_runtime()->GetGemmlowpContext(); + auto gemm_context = context->device()->cpu_runtime()->GetGemmlowpContext(); MACE_CHECK_NOTNULL(gemm_context); Tensor::MappingGuard guarda(A); @@ -158,90 +245,48 @@ struct MatMulFunctor : OpKernel { -B->zero_point(), output_pipeline); } } +}; - MaceStatus operator()(const Tensor *A, - const Tensor *B, - Tensor *C, - bool transpose_a, - bool transpose_b, - StatsFuture *future) { - MACE_UNUSED(future); - - index_t rank = A->dim_size(); - index_t height = A->dim(rank - 2); - index_t K = A->dim(rank - 1); - index_t width; - - if 
(transpose_a) { - std::swap(height, K); - } - if (transpose_b) { - width = B->dim(rank - 2); +#ifdef MACE_ENABLE_OPENCL +template +class MatMulOp : public MatMulOpBase { + public: + explicit MatMulOp(OpConstructContext *context) + : MatMulOpBase(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::MatMulKernel); } else { - width = B->dim(rank - 1); + MACE_NOT_IMPLEMENTED; } + } + MaceStatus Run(OpContext *context) override { + Validate(); + const Tensor *A = this->Input(INPUT_A); + const Tensor *B = this->Input(INPUT_B); + Tensor *C = this->Output(OUTPUT); + return kernel_->Compute(context, A, B, C, transpose_a_, transpose_b_); + } - std::vector c_shape = A->shape(); - c_shape[rank - 2] = height; - c_shape[rank - 1] = width; - - MACE_RETURN_IF_ERROR(C->Resize(c_shape)); - - constexpr gemmlowp::MapOrder kRowMajor = gemmlowp::MapOrder::RowMajor; - constexpr gemmlowp::MapOrder kColMajor = gemmlowp::MapOrder::ColMajor; - -#define MATMUL_IMPL(AOrder, BOrder) \ - MatMulImpl(A, B, height, K, width, C); - - if (transpose_a) { - if (transpose_b) { - MATMUL_IMPL(kColMajor, kColMajor); - } else { - MATMUL_IMPL(kColMajor, kRowMajor); - } - } else { - if (transpose_b) { - MATMUL_IMPL(kRowMajor, kColMajor); - } else { - MATMUL_IMPL(kRowMajor, kRowMajor); - } - } + private: + std::unique_ptr kernel_; +}; +#endif // MACE_ENABLE_OPENCL -#undef MATMUL_IMPL - return MACE_SUCCESS; - } -}; +void RegisterMatMul(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp, + DeviceType::CPU, float); + MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp, + DeviceType::CPU, uint8_t); #ifdef MACE_ENABLE_OPENCL -class OpenCLMatMulKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *A, - const Tensor *B, - Tensor *C, - bool transpose_a, - bool transpose_b, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLMatMulKernel); -}; -template -struct MatMulFunctor : 
OpKernel { - explicit MatMulFunctor(OpKernelContext *context); - - MaceStatus operator()(const Tensor *A, - const Tensor *B, - Tensor *C, - bool transpose_a, - bool transpose_b, - StatsFuture *future); + MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp, + DeviceType::GPU, float); - std::unique_ptr kernel_; -}; + MACE_REGISTER_OP(op_registry, "MatMul", MatMulOp, + DeviceType::GPU, half); #endif // MACE_ENABLE_OPENCL +} } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_MATMUL_H_ diff --git a/mace/kernels/opencl/activation.cc b/mace/kernels/opencl/activation.cc deleted file mode 100644 index 14c014ba..00000000 --- a/mace/kernels/opencl/activation.cc +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/activation.h" - -#include "mace/kernels/opencl/image/activation.h" - -namespace mace { -namespace kernels { - -template -ActivationFunctor::ActivationFunctor( - OpKernelContext *context, - ActivationType type, - T relux_max_limit) : OpKernel(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset( - new opencl::image::ActivationKernel(type, relux_max_limit)); - } else { - MACE_NOT_IMPLEMENTED; - } -} -template -MaceStatus ActivationFunctor::operator()( - const Tensor *input, - const Tensor *alpha, - Tensor *output, - StatsFuture *future) { - return kernel_->Compute(context_, input, alpha, output, future); -} - -template struct ActivationFunctor; -template struct ActivationFunctor; -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/activation.h b/mace/kernels/opencl/activation.h new file mode 100644 index 00000000..35f1785c --- /dev/null +++ b/mace/kernels/opencl/activation.h @@ -0,0 +1,40 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_ACTIVATION_H_ +#define MACE_KERNELS_OPENCL_ACTIVATION_H_ + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLActivationKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + const Tensor *alpha, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLActivationKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_ACTIVATION_H_ diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc deleted file mode 100644 index af3d18d5..00000000 --- a/mace/kernels/opencl/addn.cc +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/addn.h" - -#include "mace/kernels/opencl/image/addn.h" - -namespace mace { -namespace kernels { - -template -AddNFunctor::AddNFunctor(OpKernelContext *context) - : OpKernel(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset( - new opencl::image::AddNKernel); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus AddNFunctor::operator()( - const std::vector &input_tensors, - Tensor *output_tensor, - StatsFuture *future) { - return kernel_->Compute(context_, input_tensors, output_tensor, future); -} - -template struct AddNFunctor; -template struct AddNFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/addn.h b/mace/kernels/opencl/addn.h new file mode 100644 index 00000000..908ff113 --- /dev/null +++ b/mace/kernels/opencl/addn.h @@ -0,0 +1,42 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_ADDN_H_ +#define MACE_KERNELS_OPENCL_ADDN_H_ + +#include + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { + +class OpenCLAddNKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const std::vector &input_tensors, + Tensor *output_tensor) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLAddNKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_ADDN_H_ diff --git a/mace/kernels/opencl/batch_norm.cc b/mace/kernels/opencl/batch_norm.cc deleted file mode 100644 index c09f8eb2..00000000 --- a/mace/kernels/opencl/batch_norm.cc +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/batch_norm.h" -#include "mace/kernels/opencl/image/batch_norm.h" - -namespace mace { -namespace kernels { - -template -BatchNormFunctor::BatchNormFunctor( - OpKernelContext *context, - const bool folded_constant, - const ActivationType activation, - const float relux_max_limit) - : OpKernel(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::BatchNormKernel( - folded_constant, activation, relux_max_limit)); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus BatchNormFunctor::operator()( - const Tensor *input, - const Tensor *scale, - const Tensor *offset, - const Tensor *mean, - const Tensor *var, - const float epsilon, - Tensor *output, - StatsFuture *future) { - return kernel_->Compute(context_, input, scale, offset, mean, - var, epsilon, output, future); -} - -template struct BatchNormFunctor; -template struct BatchNormFunctor; -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/batch_norm.h b/mace/kernels/opencl/batch_norm.h new file mode 100644 index 00000000..b97dfe6e --- /dev/null +++ b/mace/kernels/opencl/batch_norm.h @@ -0,0 +1,43 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_BATCH_NORM_H_ +#define MACE_KERNELS_OPENCL_BATCH_NORM_H_ + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLBatchNormKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + const Tensor *scale, + const Tensor *offset, + const Tensor *mean, + const Tensor *var, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLBatchNormKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_BATCH_NORM_H_ diff --git a/mace/kernels/opencl/batch_to_space.cc b/mace/kernels/opencl/batch_to_space.cc deleted file mode 100644 index 7fe533eb..00000000 --- a/mace/kernels/opencl/batch_to_space.cc +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_KERNELS_OPENCL_BATCH_TO_SPACE_H_ -#define MACE_KERNELS_OPENCL_BATCH_TO_SPACE_H_ - -#include "mace/kernels/batch_to_space.h" -#include "mace/kernels/opencl/image/batch_to_space.h" - -namespace mace { -namespace kernels { - -template -BatchToSpaceFunctor::BatchToSpaceFunctor( - OpKernelContext *context, - const std::vector &paddings, - const std::vector &block_shape) - : BatchToSpaceFunctorBase(context, paddings, block_shape) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::BatchToSpaceKernel); - } else { - MACE_NOT_IMPLEMENTED; - } -} -template -MaceStatus BatchToSpaceFunctor::operator()( - const Tensor *batch_tensor, Tensor *space_tensor, StatsFuture *future) { - std::vector output_shape(4, 0); - CalculateBatchToSpaceOutputShape(batch_tensor, DataFormat::NHWC, - output_shape.data()); - return kernel_->Compute(context_, batch_tensor, paddings_, block_shape_, - output_shape, space_tensor, future); -} - -template struct BatchToSpaceFunctor; -template struct BatchToSpaceFunctor; - -} // namespace kernels -} // namespace mace -#endif // MACE_KERNELS_OPENCL_BATCH_TO_SPACE_H_ diff --git a/mace/kernels/reshape.h b/mace/kernels/opencl/batch_to_space.h similarity index 52% rename from mace/kernels/reshape.h rename to mace/kernels/opencl/batch_to_space.h index f0ab1bf5..9f155336 100644 --- a/mace/kernels/reshape.h +++ b/mace/kernels/opencl/batch_to_space.h @@ -12,35 +12,34 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_KERNELS_RESHAPE_H_ -#define MACE_KERNELS_RESHAPE_H_ +#ifndef MACE_KERNELS_OPENCL_BATCH_TO_SPACE_H_ +#define MACE_KERNELS_OPENCL_BATCH_TO_SPACE_H_ #include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" +#include "mace/core/types.h" +#include "mace/public/mace.h" +#include "mace/utils/utils.h" namespace mace { -namespace kernels { - -template -struct ReshapeFunctor : OpKernel { - explicit ReshapeFunctor(OpKernelContext *context) : OpKernel(context) {} - MaceStatus operator()(const Tensor *input, - const std::vector &out_shape, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); - output->ReuseTensorBuffer(*input); - output->Reshape(out_shape); +class OpContext; +class Tensor; - return MACE_SUCCESS; - } +namespace kernels { +class OpenCLBatchToSpaceKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *batch_tensor, + const std::vector &paddings, + const std::vector &block_shape, + const std::vector &output_shape, + Tensor *space_tensor) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLBatchToSpaceKernel); }; } // namespace kernels } // namespace mace -#endif // MACE_KERNELS_RESHAPE_H_ +#endif // MACE_KERNELS_OPENCL_BATCH_TO_SPACE_H_ diff --git a/mace/kernels/opencl/bias_add.cc b/mace/kernels/opencl/bias_add.cc deleted file mode 100644 index 6904eed9..00000000 --- a/mace/kernels/opencl/bias_add.cc +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/kernels/bias_add.h" -#include "mace/kernels/opencl/image/bias_add.h" - -namespace mace { -namespace kernels { - -template -BiasAddFunctor::BiasAddFunctor( - OpKernelContext *context, - const DataFormat data_format) - : BiasAddFunctorBase(context, data_format) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::BiasAddKernel); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus BiasAddFunctor::operator()(const Tensor *input, - const Tensor *bias, - Tensor *output, - StatsFuture *future) { - MACE_CHECK(input->dim_size() == 4 && data_format_ == NHWC, - "gpu only support biasadd for 4-dimensional NHWC format tensor"); - return kernel_->Compute(context_, input, bias, output, future); -} - -template struct BiasAddFunctor; -template struct BiasAddFunctor; -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/bias_add.h b/mace/kernels/opencl/bias_add.h new file mode 100644 index 00000000..1a0a1050 --- /dev/null +++ b/mace/kernels/opencl/bias_add.h @@ -0,0 +1,40 @@ +// Copyright 2017 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_BIAS_ADD_H_ +#define MACE_KERNELS_OPENCL_BIAS_ADD_H_ + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLBiasAddKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + const Tensor *bias, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLBiasAddKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_BIAS_ADD_H_ diff --git a/mace/kernels/opencl/buffer/buffer_inverse_transform.h b/mace/kernels/opencl/buffer/buffer_inverse_transform.h index 93bd22a9..29e63143 100644 --- a/mace/kernels/opencl/buffer/buffer_inverse_transform.h +++ b/mace/kernels/opencl/buffer/buffer_inverse_transform.h @@ -15,7 +15,10 @@ #ifndef MACE_KERNELS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_ #define MACE_KERNELS_OPENCL_BUFFER_BUFFER_INVERSE_TRANSFORM_H_ -#include "mace/kernels/buffer_inverse_transform.h" +#include "mace/kernels/opencl/buffer_inverse_transform.h" + +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -24,40 +27,37 @@ namespace opencl { namespace buffer { MaceStatus BufferTypeTransform( - OpKernelContext *context, + OpContext *context, cl::Kernel *kernel, const Tensor *input, const DataType dt, - Tensor *output, - StatsFuture *future); + Tensor *output); template class BufferInverseTransform: public OpenCLBufferInverseTransformKernel { public: - MaceStatus Compute(OpKernelContext *context, + MaceStatus Compute(OpContext *context, const Tensor *input, const BufferType type, const int wino_blk_size, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: cl::Kernel kernel_; }; template -MaceStatus BufferInverseTransform::Compute(OpKernelContext *context, +MaceStatus BufferInverseTransform::Compute(OpContext *context, const Tensor *input, const BufferType type, 
const int wino_blk_size, - Tensor *output, - StatsFuture *future) { + Tensor *output) { MACE_UNUSED(type); MACE_UNUSED(wino_blk_size); const DataType dt = DataTypeToEnum::value; if (input->dtype() != output->dtype()) { - return BufferTypeTransform(context, &kernel_, input, dt, output, future); + return BufferTypeTransform(context, &kernel_, input, dt, output); } else { - SetFutureDefaultWaitFn(future); + SetFutureDefaultWaitFn(context->future()); output->ReuseTensorBuffer(*input); return MaceStatus::MACE_SUCCESS; } diff --git a/mace/kernels/opencl/buffer/buffer_transform.cc b/mace/kernels/opencl/buffer/buffer_transform.cc index 73ee521c..7e5897a5 100644 --- a/mace/kernels/opencl/buffer/buffer_transform.cc +++ b/mace/kernels/opencl/buffer/buffer_transform.cc @@ -24,12 +24,11 @@ namespace opencl { namespace buffer { MaceStatus TransformConv2DFilter( - OpKernelContext *context, + OpContext *context, cl::Kernel *kernel, const Tensor *input, const DataType dt, - Tensor *output, - StatsFuture *future) { + Tensor *output) { const index_t out_chan = input->dim(0); const index_t in_chan = input->dim(1); const index_t filter_height = input->dim(2); @@ -90,20 +89,19 @@ MaceStatus TransformConv2DFilter( transformed_shape[3]); std::vector lws = {4, 4, 4, 0}; MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION // Mark the buffer unused. 
const_cast(input)->MarkUnused(); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } MaceStatus TransformDWConv2DFilter( - OpKernelContext *context, + OpContext *context, cl::Kernel *kernel, const Tensor *input, const DataType dt, - Tensor *output, - StatsFuture *future) { + Tensor *output) { const index_t multiplier = input->dim(0); const index_t in_chan = input->dim(1); const index_t filter_height = input->dim(2); @@ -159,20 +157,19 @@ MaceStatus TransformDWConv2DFilter( transformed_shape[3]); std::vector lws = {4, 4, 4, 0}; MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION // Mark the buffer unused. const_cast(input)->MarkUnused(); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } MaceStatus TransformArgument( - OpKernelContext *context, + OpContext *context, cl::Kernel *kernel, const Tensor *input, const DataType dt, - Tensor *output, - StatsFuture *future) { + Tensor *output) { const index_t size = input->dim(0); std::vector transformed_shape = {RoundUp(size, 4)}; @@ -225,8 +222,8 @@ MaceStatus TransformArgument( } MACE_CL_RET_STATUS(error); MACE_OUT_OF_RANGE_VALIDATION - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { + if (context->future() != nullptr) { + context->future()->wait_fn = [runtime, event](CallStats *stats) { event.wait(); if (stats != nullptr) { runtime->GetCallStats(event, stats); @@ -235,7 +232,7 @@ MaceStatus TransformArgument( } // Mark the buffer unused. 
const_cast(input)->MarkUnused(); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace buffer diff --git a/mace/kernels/opencl/buffer/buffer_transform.h b/mace/kernels/opencl/buffer/buffer_transform.h index 4c56f316..4a2213e4 100644 --- a/mace/kernels/opencl/buffer/buffer_transform.h +++ b/mace/kernels/opencl/buffer/buffer_transform.h @@ -15,9 +15,12 @@ #ifndef MACE_KERNELS_OPENCL_BUFFER_BUFFER_TRANSFORM_H_ #define MACE_KERNELS_OPENCL_BUFFER_BUFFER_TRANSFORM_H_ +#include "mace/kernels/opencl/buffer_transform.h" + #include -#include "mace/kernels/buffer_transform.h" +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -26,48 +29,43 @@ namespace opencl { namespace buffer { MaceStatus BufferTypeTransform( - OpKernelContext *context, + OpContext *context, cl::Kernel *kernel, const Tensor *input, const DataType dt, - Tensor *output, - StatsFuture *future); + Tensor *output); MaceStatus TransformConv2DFilter( - OpKernelContext *context, + OpContext *context, cl::Kernel *kernel, const Tensor *input, const DataType dt, - Tensor *output, - StatsFuture *future); + Tensor *output); MaceStatus TransformDWConv2DFilter( - OpKernelContext *context, + OpContext *context, cl::Kernel *kernel, const Tensor *input, const DataType dt, - Tensor *output, - StatsFuture *future); + Tensor *output); MaceStatus TransformArgument( - OpKernelContext *context, + OpContext *context, cl::Kernel *kernel, const Tensor *input, const DataType dt, - Tensor *output, - StatsFuture *future); + Tensor *output); template class BufferTransform: public OpenCLBufferTransformKernel { public: MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const BufferType type, const int wino_blk_size, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: cl::Kernel kernel_; @@ -75,30 +73,26 @@ class BufferTransform: public OpenCLBufferTransformKernel { }; 
template -MaceStatus BufferTransform::Compute(OpKernelContext *context, +MaceStatus BufferTransform::Compute(OpContext *context, const Tensor *input, const BufferType type, const int wino_blk_size, - Tensor *output, - StatsFuture *future) { + Tensor *output) { MACE_UNUSED(type); MACE_UNUSED(wino_blk_size); const DataType dt = DataTypeToEnum::value; switch (type) { case CONV2D_FILTER: - return TransformConv2DFilter(context, &kernel_, input, - dt, output, future); + return TransformConv2DFilter(context, &kernel_, input, dt, output); case DW_CONV2D_FILTER: - return TransformDWConv2DFilter(context, &kernel_, input, - dt, output, future); + return TransformDWConv2DFilter(context, &kernel_, input, dt, output); case ARGUMENT: - return TransformArgument(context, &kernel_, input, dt, output, future); + return TransformArgument(context, &kernel_, input, dt, output); default: if (input->dtype() != dt) { - return BufferTypeTransform(context, &kernel_, input, - dt, output, future); + return BufferTypeTransform(context, &kernel_, input, dt, output); } else { - SetFutureDefaultWaitFn(future); + SetFutureDefaultWaitFn(context->future()); output->ReuseTensorBuffer(*input); return MaceStatus::MACE_SUCCESS; } diff --git a/mace/kernels/opencl/buffer/buffer_type_transform.cc b/mace/kernels/opencl/buffer/buffer_type_transform.cc index 8de6d6df..4f78f83a 100644 --- a/mace/kernels/opencl/buffer/buffer_type_transform.cc +++ b/mace/kernels/opencl/buffer/buffer_type_transform.cc @@ -12,11 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "mace/core/op_context.h" #include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/kernels/activation.h" -#include "mace/kernels/conv_2d.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" namespace mace { namespace kernels { @@ -25,12 +24,11 @@ namespace buffer { MaceStatus BufferTypeTransform( - OpKernelContext *context, + OpContext *context, cl::Kernel *kernel, const Tensor *input, const DataType dt, - Tensor *output, - StatsFuture *future) { + Tensor *output) { MACE_RETURN_IF_ERROR(output->ResizeLike(input)); auto runtime = context->device()->opencl_runtime(); @@ -80,8 +78,8 @@ MaceStatus BufferTypeTransform( } MACE_CL_RET_STATUS(error); MACE_OUT_OF_RANGE_VALIDATION - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { + if (context->future() != nullptr) { + context->future()->wait_fn = [runtime, event](CallStats *stats) { event.wait(); if (stats != nullptr) { runtime->GetCallStats(event, stats); @@ -90,7 +88,7 @@ MaceStatus BufferTypeTransform( } // Mark the buffer unused. 
const_cast(input)->MarkUnused(); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace buffer diff --git a/mace/kernels/opencl/buffer/conv_2d.h b/mace/kernels/opencl/buffer/conv_2d.h index ba198320..8e7ee8b5 100644 --- a/mace/kernels/opencl/buffer/conv_2d.h +++ b/mace/kernels/opencl/buffer/conv_2d.h @@ -14,7 +14,7 @@ #ifndef MACE_KERNELS_OPENCL_BUFFER_CONV_2D_H_ #define MACE_KERNELS_OPENCL_BUFFER_CONV_2D_H_ -#include "mace/kernels/conv_2d.h" +#include "mace/kernels/opencl/conv_2d.h" #include #include @@ -29,7 +29,7 @@ namespace opencl { namespace buffer { namespace conv2d { -extern MaceStatus Conv2d1x1(OpKernelContext *context, +extern MaceStatus Conv2d1x1(OpContext *context, cl::Kernel *kernel, const Tensor *padded_input, const Tensor *filter, @@ -42,7 +42,7 @@ extern MaceStatus Conv2d1x1(OpKernelContext *context, Tensor *output, StatsFuture *future); -extern MaceStatus Conv2dGeneral(OpKernelContext *context, +extern MaceStatus Conv2dGeneral(OpContext *context, cl::Kernel *kernel, const Tensor *input, const Tensor *filter, @@ -63,7 +63,7 @@ class Conv2dKernel : public OpenCLConv2dKernel { Conv2dKernel() : old_scratch_size_(0) {} MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *filter, const Tensor *bias, @@ -73,8 +73,7 @@ class Conv2dKernel : public OpenCLConv2dKernel { const int *dilations, const ActivationType activation, const float relux_max_limit, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: index_t old_scratch_size_; @@ -85,7 +84,7 @@ class Conv2dKernel : public OpenCLConv2dKernel { template MaceStatus Conv2dKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *filter, const Tensor *bias, @@ -95,8 +94,7 @@ MaceStatus Conv2dKernel::Compute( const int *dilations, const ActivationType activation, const float relux_max_limit, - Tensor *output, - StatsFuture *future) { + Tensor *output) { 
StatsFuture pad_future, conv_future; index_t filter_h = filter->dim(2); index_t filter_w = filter->dim(3); @@ -206,7 +204,7 @@ MaceStatus Conv2dKernel::Compute( }; } MACE_RETURN_IF_ERROR(conv_func(padded_input_ptr, output)); - MergeMultipleFutureWaitFn({pad_future, conv_future}, future); + MergeMultipleFutureWaitFn({pad_future, conv_future}, context->future()); return MaceStatus::MACE_SUCCESS; } diff --git a/mace/kernels/opencl/buffer/conv_2d_1x1.cc b/mace/kernels/opencl/buffer/conv_2d_1x1.cc index 97854cf4..cbe12466 100644 --- a/mace/kernels/opencl/buffer/conv_2d_1x1.cc +++ b/mace/kernels/opencl/buffer/conv_2d_1x1.cc @@ -12,11 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "mace/core/op_context.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/kernels/activation.h" -#include "mace/kernels/conv_2d.h" #include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" namespace mace { namespace kernels { @@ -24,7 +23,7 @@ namespace opencl { namespace buffer { namespace conv2d { -MaceStatus Conv2d1x1(OpKernelContext *context, +MaceStatus Conv2d1x1(OpContext *context, cl::Kernel *kernel, const Tensor *padded_input, const Tensor *filter, @@ -117,7 +116,7 @@ MaceStatus Conv2d1x1(OpKernelContext *context, MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key, gws, lws, future)); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace conv2d diff --git a/mace/kernels/opencl/buffer/conv_2d_general.cc b/mace/kernels/opencl/buffer/conv_2d_general.cc index 67feecdf..17506a8b 100644 --- a/mace/kernels/opencl/buffer/conv_2d_general.cc +++ b/mace/kernels/opencl/buffer/conv_2d_general.cc @@ -12,11 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "mace/core/op_context.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/kernels/activation.h" -#include "mace/kernels/conv_2d.h" #include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" namespace mace { namespace kernels { @@ -24,7 +23,7 @@ namespace opencl { namespace buffer { namespace conv2d { -MaceStatus Conv2dGeneral(OpKernelContext *context, +MaceStatus Conv2dGeneral(OpContext *context, cl::Kernel *kernel, const Tensor *padded_input, const Tensor *filter, @@ -131,7 +130,7 @@ MaceStatus Conv2dGeneral(OpKernelContext *context, MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key, gws, lws, future)); MACE_OUT_OF_RANGE_VALIDATION - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace conv2d diff --git a/mace/kernels/opencl/buffer/depthwise_conv2d.cc b/mace/kernels/opencl/buffer/depthwise_conv2d.cc index bcd36bba..9ff27690 100644 --- a/mace/kernels/opencl/buffer/depthwise_conv2d.cc +++ b/mace/kernels/opencl/buffer/depthwise_conv2d.cc @@ -23,7 +23,7 @@ namespace opencl { namespace buffer { namespace depthwise { -MaceStatus DepthwiseConv2d(OpKernelContext *context, +MaceStatus DepthwiseConv2d(OpContext *context, cl::Kernel *kernel, const Tensor *padded_input, // NHWC const Tensor *filter, // HWIM @@ -127,7 +127,7 @@ MaceStatus DepthwiseConv2d(OpKernelContext *context, gws, lws, future)); MACE_OUT_OF_RANGE_VALIDATION - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace depthwise diff --git a/mace/kernels/opencl/buffer/depthwise_conv2d.h b/mace/kernels/opencl/buffer/depthwise_conv2d.h index 23fddf0e..b5e26c40 100644 --- a/mace/kernels/opencl/buffer/depthwise_conv2d.h +++ b/mace/kernels/opencl/buffer/depthwise_conv2d.h @@ -14,7 +14,7 @@ #ifndef MACE_KERNELS_OPENCL_BUFFER_DEPTHWISE_CONV2D_H_ #define MACE_KERNELS_OPENCL_BUFFER_DEPTHWISE_CONV2D_H_ -#include "mace/kernels/depthwise_conv2d.h" +#include "mace/kernels/opencl/depthwise_conv2d.h" #include #include @@ -29,7 
+29,7 @@ namespace opencl { namespace buffer { namespace depthwise { -MaceStatus DepthwiseConv2d(OpKernelContext *context, +MaceStatus DepthwiseConv2d(OpContext *context, cl::Kernel *kernel, const Tensor *padded_input, // NHWC const Tensor *filter, // HWIM @@ -50,7 +50,7 @@ class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel { public: DepthwiseConv2dKernel() : old_scratch_size_(0) {} MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *filter, const Tensor *bias, @@ -60,8 +60,7 @@ class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel { const int *dilations, const ActivationType activation, const float relux_max_limit, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: index_t old_scratch_size_; @@ -72,7 +71,7 @@ class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel { template MaceStatus DepthwiseConv2dKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *filter, const Tensor *bias, @@ -82,8 +81,7 @@ MaceStatus DepthwiseConv2dKernel::Compute( const int *dilations, const ActivationType activation, const float relux_max_limit, - Tensor *output, - StatsFuture *future) { + Tensor *output) { StatsFuture pad_future, dw_conv_future; index_t filter_w = filter->dim(3); @@ -178,7 +176,7 @@ MaceStatus DepthwiseConv2dKernel::Compute( context, &kernels_[1], padded_input_ptr, filter, bias, strides, dilations, DataTypeToEnum::v(), activation, relux_max_limit, input_changed, output, &dw_conv_future)); - MergeMultipleFutureWaitFn({pad_future, dw_conv_future}, future); + MergeMultipleFutureWaitFn({pad_future, dw_conv_future}, context->future()); return MaceStatus::MACE_SUCCESS; } diff --git a/mace/kernels/opencl/buffer/pooling.h b/mace/kernels/opencl/buffer/pooling.h index ef4ee447..a4433d13 100644 --- a/mace/kernels/opencl/buffer/pooling.h +++ b/mace/kernels/opencl/buffer/pooling.h @@ -14,7 +14,7 @@ #ifndef 
MACE_KERNELS_OPENCL_BUFFER_POOLING_H_ #define MACE_KERNELS_OPENCL_BUFFER_POOLING_H_ -#include "mace/kernels/pooling.h" +#include "mace/kernels/opencl/pooling.h" #include #include @@ -35,7 +35,7 @@ class PoolingKernel : public OpenCLPoolingKernel { public: PoolingKernel() : old_scratch_size_(0) {} MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const PoolingType pooling_type, const int *kernels, @@ -43,8 +43,7 @@ class PoolingKernel : public OpenCLPoolingKernel { const Padding &padding_type, const std::vector &padding_data, const int *dilations, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: index_t old_scratch_size_; @@ -55,7 +54,7 @@ class PoolingKernel : public OpenCLPoolingKernel { template MaceStatus PoolingKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const PoolingType pooling_type, const int *kernels, @@ -63,8 +62,7 @@ MaceStatus PoolingKernel::Compute( const Padding &padding_type, const std::vector &padding_data, const int *dilations, - Tensor *output, - StatsFuture *future) { + Tensor *output) { MACE_CHECK(dilations[0] == 1 && dilations[1] == 1) << "Pooling opencl kernel not support dilation yet"; @@ -200,9 +198,9 @@ MaceStatus PoolingKernel::Compute( MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, gws, lws, &pooling_future)); MACE_OUT_OF_RANGE_VALIDATION - MergeMultipleFutureWaitFn({pad_future, pooling_future}, future); + MergeMultipleFutureWaitFn({pad_future, pooling_future}, context->future()); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace buffer diff --git a/mace/kernels/opencl/buffer/softmax.h b/mace/kernels/opencl/buffer/softmax.h index 59bb8d26..502899d8 100644 --- a/mace/kernels/opencl/buffer/softmax.h +++ b/mace/kernels/opencl/buffer/softmax.h @@ -14,13 +14,15 @@ #ifndef MACE_KERNELS_OPENCL_BUFFER_SOFTMAX_H_ #define MACE_KERNELS_OPENCL_BUFFER_SOFTMAX_H_ -#include 
"mace/kernels/softmax.h" +#include "mace/kernels/opencl/softmax.h" #include #include #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -31,10 +33,9 @@ template class SoftmaxKernel : public OpenCLSoftmaxKernel { public: MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *logits, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: cl::Kernel kernel_; @@ -44,10 +45,9 @@ class SoftmaxKernel : public OpenCLSoftmaxKernel { template MaceStatus SoftmaxKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *logits, - Tensor *output, - StatsFuture *future) { + Tensor *output) { index_t batch = 0; index_t height = 0; index_t width = 0; @@ -112,9 +112,9 @@ MaceStatus SoftmaxKernel::Compute( std::string tuning_key = Concat("softmax_opencl_kernel", batch, height, width, channels); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace buffer diff --git a/mace/kernels/opencl/buffer/utils.cc b/mace/kernels/opencl/buffer/utils.cc index abc06ca8..a6d5502a 100644 --- a/mace/kernels/opencl/buffer/utils.cc +++ b/mace/kernels/opencl/buffer/utils.cc @@ -26,7 +26,7 @@ namespace kernels { namespace opencl { namespace buffer { -MaceStatus PadInput(OpKernelContext *context, +MaceStatus PadInput(OpContext *context, cl::Kernel *kernel, const Tensor *input, const int pad_top, @@ -88,7 +88,7 @@ MaceStatus PadInput(OpKernelContext *context, MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, *kernel, tuning_key, gws, lws, future)); MACE_OUT_OF_RANGE_VALIDATION - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace buffer diff --git a/mace/kernels/opencl/buffer/utils.h b/mace/kernels/opencl/buffer/utils.h index 
f19a8210..f783691f 100644 --- a/mace/kernels/opencl/buffer/utils.h +++ b/mace/kernels/opencl/buffer/utils.h @@ -16,7 +16,7 @@ #define MACE_KERNELS_OPENCL_BUFFER_UTILS_H_ #include "mace/core/future.h" -#include "mace/core/op_kernel_context.h" +#include "mace/core/op_context.h" #include "mace/core/tensor.h" #include "mace/public/mace.h" @@ -25,7 +25,7 @@ namespace kernels { namespace opencl { namespace buffer { -MaceStatus PadInput(OpKernelContext *context, +MaceStatus PadInput(OpContext *context, cl::Kernel *kernel, const Tensor *input, const int pad_top, diff --git a/mace/kernels/opencl/buffer_inverse_transform.cc b/mace/kernels/opencl/buffer_inverse_transform.cc deleted file mode 100644 index 352fbed7..00000000 --- a/mace/kernels/opencl/buffer_inverse_transform.cc +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/buffer_inverse_transform.h" -#include "mace/kernels/opencl/image/image_to_buffer.h" -#include "mace/kernels/opencl/buffer/buffer_inverse_transform.h" - -namespace mace { -namespace kernels { - -template -BufferInverseTransformFunctor< - DeviceType::GPU, T>::BufferInverseTransformFunctor( - OpKernelContext *context, - const int wino_blk_size) - : BufferInverseTransformFunctorBase(context, wino_blk_size) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::ImageToBuffer); - } else { - kernel_.reset(new opencl::buffer::BufferInverseTransform); - } -} - -template -MaceStatus BufferInverseTransformFunctor::operator()( - const Tensor *input, - const BufferType type, - Tensor *output, - StatsFuture *future) { - return kernel_->Compute(context_, input, type, - wino_blk_size_, output, future); -} - -template struct BufferInverseTransformFunctor; -template struct BufferInverseTransformFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/buffer_inverse_transform.h b/mace/kernels/opencl/buffer_inverse_transform.h new file mode 100644 index 00000000..0c785910 --- /dev/null +++ b/mace/kernels/opencl/buffer_inverse_transform.h @@ -0,0 +1,41 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_BUFFER_INVERSE_TRANSFORM_H_ +#define MACE_KERNELS_OPENCL_BUFFER_INVERSE_TRANSFORM_H_ + +#include "mace/kernels/opencl/common.h" +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLBufferInverseTransformKernel { + public: + virtual MaceStatus Compute(OpContext *context, + const Tensor *input, + const BufferType type, + const int wino_blk_size, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLBufferInverseTransformKernel) +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_BUFFER_INVERSE_TRANSFORM_H_ diff --git a/mace/kernels/opencl/buffer_transform.cc b/mace/kernels/opencl/buffer_transform.cc deleted file mode 100644 index 55854753..00000000 --- a/mace/kernels/opencl/buffer_transform.cc +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/buffer_transform.h" -#include "mace/kernels/opencl/image/buffer_to_image.h" -#include "mace/kernels/opencl/buffer/buffer_transform.h" - -namespace mace { -namespace kernels { - -template -BufferTransformFunctor::BufferTransformFunctor( - OpKernelContext *context, - const int wino_blk_size) - : BufferTransformFunctorBase(context, wino_blk_size) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::BufferToImage); - } else { - kernel_.reset(new opencl::buffer::BufferTransform); - } -} - -template -MaceStatus BufferTransformFunctor::operator()( - const Tensor *input, - const BufferType type, - Tensor *output, - StatsFuture *future) { - return kernel_->Compute(context_, input, type, - wino_blk_size_, output, future); -} - -template struct BufferTransformFunctor; -template struct BufferTransformFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/buffer_transform.h b/mace/kernels/opencl/buffer_transform.h new file mode 100644 index 00000000..cc53ef77 --- /dev/null +++ b/mace/kernels/opencl/buffer_transform.h @@ -0,0 +1,41 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_BUFFER_TRANSFORM_H_ +#define MACE_KERNELS_OPENCL_BUFFER_TRANSFORM_H_ + +#include "mace/kernels/opencl/common.h" +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLBufferTransformKernel { + public: + virtual MaceStatus Compute(OpContext *context, + const Tensor *input, + const BufferType type, + const int wino_blk_size, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLBufferTransformKernel) +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_BUFFER_TRANSFORM_H_ diff --git a/mace/kernels/opencl/channel_shuffle.cc b/mace/kernels/opencl/channel_shuffle.cc deleted file mode 100644 index 7d836503..00000000 --- a/mace/kernels/opencl/channel_shuffle.cc +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/channel_shuffle.h" -#include "mace/kernels/opencl/image/channel_shuffle.h" - -namespace mace { -namespace kernels { - -template -ChannelShuffleFunctor::ChannelShuffleFunctor( - OpKernelContext *context, - const int groups) : OpKernel(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::ChannelShuffleKernel(groups)); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus ChannelShuffleFunctor::operator()( - const Tensor *input, Tensor *output, StatsFuture *future) { - return kernel_->Compute(context_, input, output, future); -} - -template struct ChannelShuffleFunctor; -template struct ChannelShuffleFunctor; -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/channel_shuffle.h b/mace/kernels/opencl/channel_shuffle.h new file mode 100644 index 00000000..5a5da027 --- /dev/null +++ b/mace/kernels/opencl/channel_shuffle.h @@ -0,0 +1,39 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_CHANNEL_SHUFFLE_H_ +#define MACE_KERNELS_OPENCL_CHANNEL_SHUFFLE_H_ + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLChannelShuffleKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLChannelShuffleKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_CHANNEL_SHUFFLE_H_ diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc deleted file mode 100644 index 12ba334f..00000000 --- a/mace/kernels/opencl/concat.cc +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/concat.h" -#include "mace/kernels/opencl/image/concat.h" - -namespace mace { -namespace kernels { - -template -ConcatFunctor::ConcatFunctor( - OpKernelContext *context, - const int32_t axis) - : OpKernel(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::ConcatKernel(axis)); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus ConcatFunctor::operator()( - const std::vector &input_list, - Tensor *output, - StatsFuture *future) { - return kernel_->Compute(context_, input_list, output, future); -} - -template struct ConcatFunctor; -template struct ConcatFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/concat.h b/mace/kernels/opencl/concat.h new file mode 100644 index 00000000..78ef14d9 --- /dev/null +++ b/mace/kernels/opencl/concat.h @@ -0,0 +1,41 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_CONCAT_H_ +#define MACE_KERNELS_OPENCL_CONCAT_H_ + +#include + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLConcatKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const std::vector &input_list, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLConcatKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_CONCAT_H_ diff --git a/mace/kernels/opencl/conv_2d.cc b/mace/kernels/opencl/conv_2d.cc deleted file mode 100644 index 38bb2e8f..00000000 --- a/mace/kernels/opencl/conv_2d.cc +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/conv_2d.h" -#include "mace/kernels/opencl/image/conv_2d.h" -#include "mace/kernels/opencl/buffer/conv_2d.h" - -namespace mace { -namespace kernels { - -template -Conv2dFunctor::Conv2dFunctor( - OpKernelContext *context, - const int *strides, - const Padding &padding_type, - const std::vector &paddings, - const int *dilations, - const ActivationType activation, - const float relux_max_limit) - : Conv2dFunctorBase(context, - strides, - padding_type, - paddings, - dilations, - activation, - relux_max_limit) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::Conv2dKernel); - } else { - kernel_.reset(new opencl::buffer::Conv2dKernel); - } -} - -template -MaceStatus Conv2dFunctor::operator()(const Tensor *input, - const Tensor *filter, - const Tensor *bias, - Tensor *output, - StatsFuture *future) { - // Compute - return kernel_->Compute(context_, input, filter, bias, - strides_, padding_type_, paddings_, - dilations_, activation_, relux_max_limit_, - output, future); -} - -template struct Conv2dFunctor; -template struct Conv2dFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/conv_2d.h b/mace/kernels/opencl/conv_2d.h new file mode 100644 index 00000000..d5ff40ed --- /dev/null +++ b/mace/kernels/opencl/conv_2d.h @@ -0,0 +1,47 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_CONV_2D_H_ +#define MACE_KERNELS_OPENCL_CONV_2D_H_ + +#include + +#include "mace/kernels/activation.h" +#include "mace/kernels/conv_pool_2d_util.h" + +namespace mace { +class OpContext; + +namespace kernels { +class OpenCLConv2dKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLConv2dKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_CONV_2D_H_ diff --git a/mace/kernels/opencl/crop.cc b/mace/kernels/opencl/crop.cc deleted file mode 100644 index 720b2c8c..00000000 --- a/mace/kernels/opencl/crop.cc +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/crop.h" -#include "mace/kernels/opencl/image/crop.h" - -namespace mace { -namespace kernels { - -template -CropFunctor::CropFunctor(OpKernelContext *context, - const int axis, - const std::vector &offset) - : OpKernel(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::CropKernel(axis, offset)); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus CropFunctor::operator()( - const std::vector &input_list, - Tensor *output, - StatsFuture *future) { - return kernel_->Compute(context_, input_list, output, future); -} - -template struct CropFunctor; -template struct CropFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/crop.h b/mace/kernels/opencl/crop.h new file mode 100644 index 00000000..d59f67f5 --- /dev/null +++ b/mace/kernels/opencl/crop.h @@ -0,0 +1,41 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_CROP_H_ +#define MACE_KERNELS_OPENCL_CROP_H_ + +#include + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLCropKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const std::vector &input_list, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLCropKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_CROP_H_ diff --git a/mace/kernels/opencl/deconv_2d.cc b/mace/kernels/opencl/deconv_2d.cc deleted file mode 100644 index e449a2ef..00000000 --- a/mace/kernels/opencl/deconv_2d.cc +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/deconv_2d.h" -#include "mace/kernels/opencl/image/deconv_2d.h" - -namespace mace { -namespace kernels { - -template -Deconv2dFunctor::Deconv2dFunctor( - OpKernelContext *context, - const std::vector &strides, - const Padding &padding_type, - const std::vector &paddings, - const FrameworkType model_type, - const ActivationType activation, - const float relux_max_limit) - : Deconv2dFunctorBase(context, - strides, - padding_type, - paddings, - model_type, - activation, - relux_max_limit) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::Deconv2dKernel); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus Deconv2dFunctor::operator()( - const Tensor *input, - const Tensor *filter, - const Tensor *bias, - const Tensor *output_shape_tensor, - Tensor *output, - StatsFuture *future) { - MACE_CHECK_NOTNULL(input); - MACE_CHECK_NOTNULL(filter); - MACE_CHECK_NOTNULL(output); - std::vector paddings(2); - std::vector out_paddings(2); - std::vector output_shape(4); - if (model_type_ == FrameworkType::TENSORFLOW) { - paddings = std::vector(2, 0); - MACE_CHECK_NOTNULL(output_shape_tensor); - MACE_CHECK(output_shape_tensor->size() == 4); - Tensor::MappingGuard output_shape_mapper(output_shape_tensor); - auto output_shape_data = - output_shape_tensor->data(); - output_shape = - std::vector(output_shape_data, output_shape_data + 4); - CalcDeconvPaddingAndInputSize(input->shape().data(), - filter->shape().data(), - strides_.data(), - padding_type_, - output_shape.data(), - paddings.data()); - } else { - out_paddings = paddings_; - paddings = std::vector(2, 0); - output_shape = std::vector(4, 0); - CalcDeconvOutputSize(input->shape().data(), - filter->shape().data(), - strides_.data(), - output_shape.data(), - out_paddings.data(), - paddings.data()); - } - - return kernel_->Compute(context_, input, filter, bias, - strides_.data(), paddings.data(), activation_, - relux_max_limit_, output_shape, 
output, future); -} - -template struct Deconv2dFunctor; -template struct Deconv2dFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/deconv_2d.h b/mace/kernels/opencl/deconv_2d.h new file mode 100644 index 00000000..c601acfe --- /dev/null +++ b/mace/kernels/opencl/deconv_2d.h @@ -0,0 +1,46 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_KERNELS_OPENCL_DECONV_2D_H_ +#define MACE_KERNELS_OPENCL_DECONV_2D_H_ + +#include + +#include "mace/kernels/activation.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLDeconv2dKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const int *padding_data, + const ActivationType activation, + const float relux_max_limit, + const std::vector &output_shape, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLDeconv2dKernel); +}; +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_DECONV_2D_H_ diff --git a/mace/kernels/opencl/depth_to_space.cc b/mace/kernels/opencl/depth_to_space.cc deleted file mode 100644 index 2ab670d7..00000000 --- a/mace/kernels/opencl/depth_to_space.cc +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/kernels/depth_to_space.h" -#include "mace/kernels/opencl/image/depth_to_space.h" - -namespace mace { -namespace kernels { -template -DepthToSpaceOpFunctor::DepthToSpaceOpFunctor( - OpKernelContext *context, - const int block_size) - : OpKernel(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::DepthToSpaceKernel(block_size)); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus DepthToSpaceOpFunctor::operator()( - const Tensor *input, Tensor *output, StatsFuture *future) { - return kernel_->Compute(context_, input, output, future); -} - -template struct DepthToSpaceOpFunctor; -template struct DepthToSpaceOpFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/depth_to_space.h b/mace/kernels/opencl/depth_to_space.h new file mode 100644 index 00000000..02585911 --- /dev/null +++ b/mace/kernels/opencl/depth_to_space.h @@ -0,0 +1,39 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_KERNELS_OPENCL_DEPTH_TO_SPACE_H_ +#define MACE_KERNELS_OPENCL_DEPTH_TO_SPACE_H_ + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { + +class OpenCLDepthToSpaceKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLDepthToSpaceKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_DEPTH_TO_SPACE_H_ diff --git a/mace/kernels/opencl/depthwise_conv2d.cc b/mace/kernels/opencl/depthwise_conv2d.cc deleted file mode 100644 index 29f02876..00000000 --- a/mace/kernels/opencl/depthwise_conv2d.cc +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/depthwise_conv2d.h" -#include "mace/kernels/opencl/buffer/depthwise_conv2d.h" -#include "mace/kernels/opencl/image/depthwise_conv2d.h" - -namespace mace { -namespace kernels { -template -DepthwiseConv2dFunctor::DepthwiseConv2dFunctor( - OpKernelContext *context, - const int *strides, - const Padding padding_type, - const std::vector &paddings, - const int *dilations, - const ActivationType activation, - const float relux_max_limit) - : DepthwiseConv2dFunctorBase(context, - strides, - padding_type, - paddings, - dilations, - activation, - relux_max_limit) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::DepthwiseConv2dKernel); - } else { - kernel_.reset(new opencl::buffer::DepthwiseConv2dKernel); - } -} - -template -MaceStatus DepthwiseConv2dFunctor::operator()( - const Tensor *input, - const Tensor *filter, /* MIHW */ - const Tensor *bias, - Tensor *output, - StatsFuture *future) { - return kernel_->Compute(context_, input, filter, bias, - strides_, padding_type_, paddings_, - dilations_, activation_, relux_max_limit_, - output, future); -} - -template struct DepthwiseConv2dFunctor; -template struct DepthwiseConv2dFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/depthwise_conv2d.h b/mace/kernels/opencl/depthwise_conv2d.h new file mode 100644 index 00000000..24d08a20 --- /dev/null +++ b/mace/kernels/opencl/depthwise_conv2d.h @@ -0,0 +1,48 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_KERNELS_OPENCL_DEPTHWISE_CONV2D_H_ +#define MACE_KERNELS_OPENCL_DEPTHWISE_CONV2D_H_ + +#include + +#include "mace/kernels/activation.h" +#include "mace/kernels/conv_pool_2d_util.h" + +namespace mace { + +class OpContext; + +namespace kernels { +class OpenCLDepthwiseConv2dKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + const Tensor *filter, + const Tensor *bias, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + const ActivationType activation, + const float relux_max_limit, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLDepthwiseConv2dKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_DEPTHWISE_CONV2D_H_ diff --git a/mace/kernels/opencl/eltwise.cc b/mace/kernels/opencl/eltwise.cc deleted file mode 100644 index e43e2198..00000000 --- a/mace/kernels/opencl/eltwise.cc +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/eltwise.h" -#include "mace/kernels/opencl/image/eltwise.h" - -namespace mace { -namespace kernels { -template -EltwiseFunctor::EltwiseFunctor( - OpKernelContext *context, - const EltwiseType type, - const std::vector &coeff, - const float scalar_input, - const int32_t scalar_input_index, - const DataFormat data_format) : OpKernel(context) { - MACE_UNUSED(data_format); - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::EltwiseKernel( - type, coeff, scalar_input, scalar_input_index)); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus EltwiseFunctor::operator()(const Tensor *input0, - const Tensor *input1, - Tensor *output, - StatsFuture *future) { - return kernel_->Compute(context_, input0, input1, output, future); -} - -template struct EltwiseFunctor; -template struct EltwiseFunctor; -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/eltwise.h b/mace/kernels/opencl/eltwise.h new file mode 100644 index 00000000..83a94feb --- /dev/null +++ b/mace/kernels/opencl/eltwise.h @@ -0,0 +1,40 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_ELTWISE_H_ +#define MACE_KERNELS_OPENCL_ELTWISE_H_ + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLEltwiseKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input0, + const Tensor *input1, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLEltwiseKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_ELTWISE_H_ diff --git a/mace/kernels/opencl/fully_connected.cc b/mace/kernels/opencl/fully_connected.cc deleted file mode 100644 index 3dd0db4c..00000000 --- a/mace/kernels/opencl/fully_connected.cc +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/fully_connected.h" -#include "mace/kernels/opencl/image/fully_connected.h" - -namespace mace { -namespace kernels { - -template -FullyConnectedFunctor::FullyConnectedFunctor( - OpKernelContext *context, - const ActivationType activation, - const float relux_max_limit) - : FullyConnectedBase(context, activation, relux_max_limit) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::FullyConnectedKernel); - } else { - MACE_NOT_IMPLEMENTED; - } -} -template -MaceStatus FullyConnectedFunctor::operator()( - const Tensor *input, - const Tensor *weight, - const Tensor *bias, - Tensor *output, - StatsFuture *future) { - return kernel_->Compute( - context_, input, weight, bias, activation_, relux_max_limit_, - output, future); -} - -template struct FullyConnectedFunctor; - -template struct FullyConnectedFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/fully_connected.h b/mace/kernels/opencl/fully_connected.h new file mode 100644 index 00000000..7982d468 --- /dev/null +++ b/mace/kernels/opencl/fully_connected.h @@ -0,0 +1,45 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_FULLY_CONNECTED_H_ +#define MACE_KERNELS_OPENCL_FULLY_CONNECTED_H_ + +#include "mace/kernels/activation.h" + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLFullyConnectedKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + const Tensor *weight, + const Tensor *bias, + const ActivationType activation, + const float relux_max_limit, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLFullyConnectedKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_FULLY_CONNECTED_H_ diff --git a/mace/kernels/opencl/image/activation.h b/mace/kernels/opencl/image/activation.h index 5ddf00ac..b1633076 100644 --- a/mace/kernels/opencl/image/activation.h +++ b/mace/kernels/opencl/image/activation.h @@ -14,13 +14,16 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_ACTIVATION_H_ #define MACE_KERNELS_OPENCL_IMAGE_ACTIVATION_H_ -#include "mace/kernels/activation.h" +#include "mace/kernels/opencl/activation.h" #include #include #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" +#include "mace/kernels/activation.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -36,11 +39,10 @@ class ActivationKernel : public OpenCLActivationKernel { : activation_(type), relux_max_limit_(relux_max_limit) {} MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *alpha, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: ActivationType activation_; @@ -53,11 +55,10 @@ class ActivationKernel : public OpenCLActivationKernel { template MaceStatus ActivationKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *alpha, - Tensor *output, - StatsFuture *future) { + Tensor *output) { const index_t batch = input->dim(0); const 
index_t height = input->dim(1); const index_t width = input->dim(2); @@ -133,10 +134,10 @@ MaceStatus ActivationKernel::Compute( Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/addn.h b/mace/kernels/opencl/image/addn.h index 49721c09..8f50d140 100644 --- a/mace/kernels/opencl/image/addn.h +++ b/mace/kernels/opencl/image/addn.h @@ -14,13 +14,15 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_ADDN_H_ #define MACE_KERNELS_OPENCL_IMAGE_ADDN_H_ -#include "mace/kernels/addn.h" +#include "mace/kernels/opencl/addn.h" #include #include #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -32,10 +34,9 @@ template class AddNKernel : public OpenCLAddNKernel { public: MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const std::vector &input_tensors, - Tensor *output_tensor, - StatsFuture *future) override; + Tensor *output_tensor) override; private: cl::Kernel kernel_; @@ -45,10 +46,9 @@ class AddNKernel : public OpenCLAddNKernel { template MaceStatus AddNKernel::Compute( - OpKernelContext *context, + OpContext *context, const std::vector &input_tensors, - Tensor *output_tensor, - StatsFuture *future) { + Tensor *output_tensor) { size_t size = input_tensors.size(); MACE_CHECK(size >= 2 && input_tensors[0] != nullptr); @@ -122,9 +122,9 @@ MaceStatus AddNKernel::Compute( Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1), output_tensor->dim(2), output_tensor->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; 
+ return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/batch_norm.h b/mace/kernels/opencl/image/batch_norm.h index 7b160154..9414f28b 100644 --- a/mace/kernels/opencl/image/batch_norm.h +++ b/mace/kernels/opencl/image/batch_norm.h @@ -14,13 +14,16 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_BATCH_NORM_H_ #define MACE_KERNELS_OPENCL_IMAGE_BATCH_NORM_H_ -#include "mace/kernels/batch_norm.h" +#include "mace/kernels/opencl/batch_norm.h" #include #include #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" +#include "mace/kernels/activation.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -32,21 +35,19 @@ template class BatchNormKernel : public OpenCLBatchNormKernel { public: BatchNormKernel( - const bool folded_constant, + const float epsilon, const ActivationType activation, const float relux_max_limit); - MaceStatus Compute(OpKernelContext *context, + MaceStatus Compute(OpContext *context, const Tensor *input, const Tensor *scale, const Tensor *offset, const Tensor *mean, const Tensor *var, - const float epsilon, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: - const bool folded_constant_; + const float epsilon_; const ActivationType activation_; const float relux_max_limit_; cl::Kernel kernel_; @@ -55,25 +56,23 @@ class BatchNormKernel : public OpenCLBatchNormKernel { }; template -BatchNormKernel::BatchNormKernel(const bool folded_constant, +BatchNormKernel::BatchNormKernel(const float epsilon, const ActivationType activation, const float relux_max_limit) - : folded_constant_(folded_constant), + : epsilon_(epsilon), activation_(activation), relux_max_limit_(relux_max_limit) {} template MaceStatus BatchNormKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *scale, const Tensor *offset, const Tensor *mean, const Tensor *var, - const float epsilon, - Tensor *output, - StatsFuture *future) { - 
MACE_CHECK(folded_constant_ || (mean != nullptr && var != nullptr)); + Tensor *output) { + bool not_folded = (mean != nullptr && var != nullptr); const index_t batch = input->dim(0); const index_t height = input->dim(1); @@ -98,7 +97,7 @@ MaceStatus BatchNormKernel::Compute( built_options.emplace("-Dbatch_norm=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - if (folded_constant_) { + if (!not_folded) { built_options.emplace("-DFOLDED_CONSTANT"); } switch (activation_) { @@ -134,10 +133,10 @@ MaceStatus BatchNormKernel::Compute( kernel_.setArg(idx++, *(input->opencl_image())); kernel_.setArg(idx++, *(scale->opencl_image())); kernel_.setArg(idx++, *(offset->opencl_image())); - if (!folded_constant_) { + if (not_folded) { kernel_.setArg(idx++, *(mean->opencl_image())); kernel_.setArg(idx++, *(var->opencl_image())); - kernel_.setArg(idx++, epsilon); + kernel_.setArg(idx++, epsilon_); } kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, relux_max_limit_); @@ -148,11 +147,11 @@ MaceStatus BatchNormKernel::Compute( const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); std::string tuning_key = Concat("batch_norm_opencl_kernel", activation_, output->dim(0), - output->dim(1), output->dim(2), output->dim(3), folded_constant_); + output->dim(1), output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/batch_to_space.h b/mace/kernels/opencl/image/batch_to_space.h index f3c4bf8c..8d984270 100644 --- a/mace/kernels/opencl/image/batch_to_space.h +++ b/mace/kernels/opencl/image/batch_to_space.h @@ -14,13 +14,15 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_BATCH_TO_SPACE_H_ #define 
MACE_KERNELS_OPENCL_IMAGE_BATCH_TO_SPACE_H_ -#include "mace/kernels/batch_to_space.h" +#include "mace/kernels/opencl/batch_to_space.h" #include #include #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -32,13 +34,12 @@ template class BatchToSpaceKernel : public OpenCLBatchToSpaceKernel { public: MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *batch_tensor, const std::vector &paddings, const std::vector &block_shape, const std::vector &output_shape, - Tensor *space_tensor, - StatsFuture *future) override; + Tensor *space_tensor) override; private: cl::Kernel kernel_; @@ -48,13 +49,12 @@ class BatchToSpaceKernel : public OpenCLBatchToSpaceKernel { template MaceStatus BatchToSpaceKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *batch_tensor, const std::vector &paddings, const std::vector &block_shape, const std::vector &output_shape, - Tensor *space_tensor, - StatsFuture *future) { + Tensor *space_tensor) { std::vector output_image_shape; CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &output_image_shape); @@ -116,10 +116,10 @@ MaceStatus BatchToSpaceKernel::Compute( Concat("batch_to_space", batch_tensor->dim(0), batch_tensor->dim(1), batch_tensor->dim(2), batch_tensor->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/bias_add.h b/mace/kernels/opencl/image/bias_add.h index 3a84cbce..2180df11 100644 --- a/mace/kernels/opencl/image/bias_add.h +++ b/mace/kernels/opencl/image/bias_add.h @@ -14,13 +14,15 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_BIAS_ADD_H_ #define MACE_KERNELS_OPENCL_IMAGE_BIAS_ADD_H_ -#include "mace/kernels/bias_add.h" +#include 
"mace/kernels/opencl/bias_add.h" #include #include #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -32,11 +34,10 @@ template class BiasAddKernel : public OpenCLBiasAddKernel { public: MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *bias, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: cl::Kernel kernel_; @@ -46,11 +47,10 @@ class BiasAddKernel : public OpenCLBiasAddKernel { template MaceStatus BiasAddKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *bias, - Tensor *output, - StatsFuture *future) { + Tensor *output) { const index_t batch = input->dim(0); const index_t height = input->dim(1); const index_t width = input->dim(2); @@ -111,8 +111,8 @@ MaceStatus BiasAddKernel::Compute( } MACE_CL_RET_STATUS(error); MACE_OUT_OF_RANGE_VALIDATION; - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { + if (context->future() != nullptr) { + context->future()->wait_fn = [runtime, event](CallStats *stats) { event.wait(); if (stats != nullptr) { runtime->GetCallStats(event, stats); @@ -120,7 +120,7 @@ MaceStatus BiasAddKernel::Compute( }; } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/buffer_to_image.h b/mace/kernels/opencl/image/buffer_to_image.h index a791c064..208c33fa 100644 --- a/mace/kernels/opencl/image/buffer_to_image.h +++ b/mace/kernels/opencl/image/buffer_to_image.h @@ -15,11 +15,14 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_BUFFER_TO_IMAGE_H_ #define MACE_KERNELS_OPENCL_IMAGE_BUFFER_TO_IMAGE_H_ +#include "mace/kernels/opencl/buffer_transform.h" + #include #include #include -#include "mace/kernels/buffer_transform.h" +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" 
namespace mace { @@ -31,12 +34,11 @@ template class BufferToImage : public OpenCLBufferTransformKernel { public: MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const BufferType type, const int wino_blk_size, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: cl::Kernel kernel_; @@ -45,12 +47,11 @@ class BufferToImage : public OpenCLBufferTransformKernel { template MaceStatus BufferToImage::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const BufferType type, const int wino_blk_size, - Tensor *output, - StatsFuture *future) { + Tensor *output) { auto formatted_buffer_shape = FormatBufferShape(input->shape(), type); std::vector image_shape; CalImage2DShape(formatted_buffer_shape, type, &image_shape, wino_blk_size); @@ -186,8 +187,8 @@ MaceStatus BufferToImage::Compute( } MACE_CL_RET_STATUS(error); MACE_OUT_OF_RANGE_VALIDATION; - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { + if (context->future() != nullptr) { + context->future()->wait_fn = [runtime, event](CallStats *stats) { event.wait(); if (stats != nullptr) { runtime->GetCallStats(event, stats); @@ -198,7 +199,7 @@ MaceStatus BufferToImage::Compute( // Mark the buffer unused. 
const_cast(input)->MarkUnused(); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/channel_shuffle.h b/mace/kernels/opencl/image/channel_shuffle.h index 5034f569..8d351c0a 100644 --- a/mace/kernels/opencl/image/channel_shuffle.h +++ b/mace/kernels/opencl/image/channel_shuffle.h @@ -14,13 +14,15 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_CHANNEL_SHUFFLE_H_ #define MACE_KERNELS_OPENCL_IMAGE_CHANNEL_SHUFFLE_H_ -#include "mace/kernels/channel_shuffle.h" +#include "mace/kernels/opencl/channel_shuffle.h" #include #include #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -33,10 +35,9 @@ class ChannelShuffleKernel : public OpenCLChannelShuffleKernel { public: explicit ChannelShuffleKernel(const int groups) : groups_(groups) {} MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: const int groups_; @@ -47,10 +48,12 @@ class ChannelShuffleKernel : public OpenCLChannelShuffleKernel { template MaceStatus ChannelShuffleKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, - Tensor *output, - StatsFuture *future) { + Tensor *output) { + MACE_CHECK(input->dim(3) % groups_ == 0, + "input channels must be an integral multiple of group. 
", + input->dim(3)); MACE_RETURN_IF_ERROR(output->ResizeLike(input)); const index_t batch = input->dim(0); @@ -105,9 +108,9 @@ MaceStatus ChannelShuffleKernel::Compute( Concat("channel_shuffle_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/concat.cc b/mace/kernels/opencl/image/concat.cc index 60144d55..9fc6dd48 100644 --- a/mace/kernels/opencl/image/concat.cc +++ b/mace/kernels/opencl/image/concat.cc @@ -46,14 +46,13 @@ std::vector LocalWS(OpenCLRuntime *runtime, } // namespace -MaceStatus Concat2(OpKernelContext *context, +MaceStatus Concat2(OpContext *context, cl::Kernel *kernel, const Tensor *input0, const Tensor *input1, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future, uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); @@ -112,17 +111,16 @@ MaceStatus Concat2(OpKernelContext *context, Concat("concat_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } -MaceStatus ConcatN(OpKernelContext *context, +MaceStatus ConcatN(OpContext *context, cl::Kernel *kernel, const std::vector &input_list, const DataType dt, Tensor *output, - StatsFuture *future, uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); @@ -185,7 +183,7 @@ MaceStatus ConcatN(OpKernelContext *context, } MACE_CL_RET_STATUS(error); MACE_OUT_OF_RANGE_VALIDATION; - if (future != nullptr && runtime->is_profiling_enabled()) { + if 
(context->future() != nullptr && runtime->is_profiling_enabled()) { event.wait(); CallStats tmp_stats; runtime->GetCallStats(event, &tmp_stats); @@ -194,8 +192,8 @@ MaceStatus ConcatN(OpKernelContext *context, call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros; } } - if (future != nullptr) { - future->wait_fn = [call_stats](CallStats *stats) { + if (context->future() != nullptr) { + context->future()->wait_fn = [call_stats](CallStats *stats) { if (stats != nullptr) { stats->start_micros = call_stats.start_micros; stats->end_micros = stats->start_micros + call_stats.end_micros; @@ -203,7 +201,7 @@ MaceStatus ConcatN(OpKernelContext *context, }; } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace concat diff --git a/mace/kernels/opencl/image/concat.h b/mace/kernels/opencl/image/concat.h index 6289a000..4041cc3e 100644 --- a/mace/kernels/opencl/image/concat.h +++ b/mace/kernels/opencl/image/concat.h @@ -14,11 +14,13 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_CONCAT_H_ #define MACE_KERNELS_OPENCL_IMAGE_CONCAT_H_ -#include "mace/kernels/concat.h" +#include "mace/kernels/opencl/concat.h" #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -26,22 +28,20 @@ namespace kernels { namespace opencl { namespace image { namespace concat { -MaceStatus Concat2(OpKernelContext *context, +MaceStatus Concat2(OpContext *context, cl::Kernel *kernel, const Tensor *input0, const Tensor *input1, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future, uint32_t *kwg_size); -MaceStatus ConcatN(OpKernelContext *context, +MaceStatus ConcatN(OpContext *context, cl::Kernel *kernel, const std::vector &input_list, const DataType dt, Tensor *output, - StatsFuture *future, uint32_t *kwg_size); } // namespace concat @@ -50,10 +50,9 @@ class ConcatKernel : public OpenCLConcatKernel { public: explicit ConcatKernel(const int32_t axis) : 
axis_(axis) {} MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const std::vector &input_list, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: int32_t axis_; @@ -64,10 +63,9 @@ class ConcatKernel : public OpenCLConcatKernel { template MaceStatus ConcatKernel::Compute( - OpKernelContext *context, + OpContext *context, const std::vector &input_list, - Tensor *output, - StatsFuture *future) { + Tensor *output) { const int inputs_count = input_list.size(); MACE_CHECK(inputs_count >= 2 && axis_ == 3) << "Concat opencl kernel only support >=2 elements with axis == 3"; @@ -101,18 +99,17 @@ MaceStatus ConcatKernel::Compute( case 2: return concat::Concat2( context, &kernel_, input_list[0], input_list[1], - DataTypeToEnum::value, &input_shape_, output, future, &kwg_size_); + DataTypeToEnum::value, &input_shape_, output, &kwg_size_); default: if (divisible_four) { return concat::ConcatN(context, &kernel_, input_list, - DataTypeToEnum::value, output, future, - &kwg_size_); + DataTypeToEnum::value, output, &kwg_size_); } else { MACE_NOT_IMPLEMENTED; } } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/conv_2d.h b/mace/kernels/opencl/image/conv_2d.h index 05ee6a07..415beac4 100644 --- a/mace/kernels/opencl/image/conv_2d.h +++ b/mace/kernels/opencl/image/conv_2d.h @@ -14,11 +14,13 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_CONV_2D_H_ #define MACE_KERNELS_OPENCL_IMAGE_CONV_2D_H_ -#include "mace/kernels/conv_2d.h" +#include "mace/kernels/opencl/conv_2d.h" #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -26,7 +28,7 @@ namespace kernels { namespace opencl { namespace image { -extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *context, +extern MaceStatus Conv2dOpenclK1x1(OpContext *context, cl::Kernel *kernel, const Tensor *input, const Tensor *filter, @@ -39,10 
+41,9 @@ extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *context, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future, uint32_t *kwg_size); -extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *context, +extern MaceStatus Conv2dOpenclK3x3(OpContext *context, cl::Kernel *kernel, const Tensor *input, const Tensor *filter, @@ -55,10 +56,9 @@ extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *context, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future, uint32_t *kwg_size); -extern MaceStatus Conv2dOpencl(OpKernelContext *context, +extern MaceStatus Conv2dOpencl(OpContext *context, cl::Kernel *kernel, const Tensor *input, const Tensor *filter, @@ -71,7 +71,6 @@ extern MaceStatus Conv2dOpencl(OpKernelContext *context, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future, uint32_t *kwg_size); @@ -79,7 +78,7 @@ template class Conv2dKernel : public OpenCLConv2dKernel { public: MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *filter, const Tensor *bias, @@ -89,8 +88,7 @@ class Conv2dKernel : public OpenCLConv2dKernel { const int *dilations, const ActivationType activation, const float relux_max_limit, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: cl::Kernel kernel_; @@ -100,7 +98,7 @@ class Conv2dKernel : public OpenCLConv2dKernel { template MaceStatus Conv2dKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *filter, const Tensor *bias, @@ -110,15 +108,14 @@ MaceStatus Conv2dKernel::Compute( const int *dilations, const ActivationType activation, const float relux_max_limit, - Tensor *output, - StatsFuture *future) { + Tensor *output) { typedef MaceStatus (*Conv2dOpenclFunction)( - OpKernelContext *context, - cl::Kernel * kernel, const Tensor *input, const Tensor *filter, + OpContext *context, + cl::Kernel *kernel, const 
Tensor *input, const Tensor *filter, const Tensor *bias, const int stride, const int *padding, const int *dilations, const ActivationType activation, const float relux_max_limit, const DataType dt, - std::vector *input_shape, Tensor *output, StatsFuture *future, + std::vector *input_shape, Tensor *output, uint32_t *kwg_size); // Selection matrix: kernel_size x stride_size static const Conv2dOpenclFunction selector[3] = { @@ -161,13 +158,13 @@ MaceStatus Conv2dKernel::Compute( return conv2d_func(context, &kernel_, input, filter, bias, strides[0], paddings.data(), dilations, activation, relux_max_limit, DataTypeToEnum::value, &input_shape_, - output, future, &kwg_size_); + output, &kwg_size_); } else { return Conv2dOpencl( context, &kernel_, input, filter, bias, strides[0], paddings.data(), dilations, activation, relux_max_limit, DataTypeToEnum::value, &input_shape_, - output, future, &kwg_size_); + output, &kwg_size_); } } diff --git a/mace/kernels/opencl/image/conv_2d_1x1.cc b/mace/kernels/opencl/image/conv_2d_1x1.cc index 2460afe5..36f8ba34 100644 --- a/mace/kernels/opencl/image/conv_2d_1x1.cc +++ b/mace/kernels/opencl/image/conv_2d_1x1.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "mace/core/op_context.h" #include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/kernels/conv_2d.h" +#include "mace/kernels/activation.h" #include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" namespace mace { namespace kernels { @@ -66,7 +66,7 @@ std::vector LocalWS(OpenCLRuntime *runtime, } // namespace -extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *context, +extern MaceStatus Conv2dOpenclK1x1(OpContext *context, cl::Kernel *kernel, const Tensor *input, const Tensor *filter, @@ -79,7 +79,6 @@ extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *context, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future, uint32_t *kwg_size) { MACE_UNUSED(padding); MACE_UNUSED(dilations); @@ -170,9 +169,9 @@ extern MaceStatus Conv2dOpenclK1x1(OpKernelContext *context, Concat("conv2d_1x1_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/conv_2d_3x3.cc b/mace/kernels/opencl/image/conv_2d_3x3.cc index 900cd610..f2f94c03 100644 --- a/mace/kernels/opencl/image/conv_2d_3x3.cc +++ b/mace/kernels/opencl/image/conv_2d_3x3.cc @@ -12,11 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "mace/core/op_context.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/kernels/activation.h" -#include "mace/kernels/conv_2d.h" #include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" #include "mace/utils/utils.h" namespace mace { @@ -60,7 +59,7 @@ std::vector LocalWS(OpenCLRuntime *runtime, } // namespace -extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *context, +extern MaceStatus Conv2dOpenclK3x3(OpContext *context, cl::Kernel *kernel, const Tensor *input, const Tensor *filter, @@ -73,7 +72,6 @@ extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *context, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future, uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); @@ -158,9 +156,9 @@ extern MaceStatus Conv2dOpenclK3x3(OpKernelContext *context, Concat("conv2d_3x3_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/conv_2d_general.cc b/mace/kernels/opencl/image/conv_2d_general.cc index 0286edf7..8221814e 100644 --- a/mace/kernels/opencl/image/conv_2d_general.cc +++ b/mace/kernels/opencl/image/conv_2d_general.cc @@ -12,11 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "mace/core/op_context.h" #include "mace/core/runtime/opencl/opencl_runtime.h" -#include "mace/kernels/activation.h" -#include "mace/kernels/conv_2d.h" #include "mace/kernels/opencl/helper.h" -#include "mace/utils/tuner.h" +#include "mace/kernels/activation.h" #include "mace/utils/utils.h" namespace mace { @@ -68,7 +67,7 @@ std::vector LocalWS(OpenCLRuntime *runtime, } // namespace -extern MaceStatus Conv2dOpencl(OpKernelContext *context, +extern MaceStatus Conv2dOpencl(OpContext *context, cl::Kernel *kernel, const Tensor *input, const Tensor *filter, @@ -81,7 +80,6 @@ extern MaceStatus Conv2dOpencl(OpKernelContext *context, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future, uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); @@ -170,10 +168,10 @@ extern MaceStatus Conv2dOpencl(OpKernelContext *context, std::vector lws = LocalWS(runtime, gws, filter->dim(2) * filter->dim(3), *kwg_size); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/crop.h b/mace/kernels/opencl/image/crop.h index 10aa6ecb..7ab8ce1c 100644 --- a/mace/kernels/opencl/image/crop.h +++ b/mace/kernels/opencl/image/crop.h @@ -14,13 +14,15 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_CROP_H_ #define MACE_KERNELS_OPENCL_IMAGE_CROP_H_ -#include "mace/kernels/crop.h" +#include "mace/kernels/opencl/crop.h" #include #include #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -36,10 +38,9 @@ class CropKernel : public OpenCLCropKernel { const std::vector &offset) : axis_(axis), offset_(offset) {} MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const std::vector &input_list, - Tensor *output, 
- StatsFuture *future) override; + Tensor *output) override; private: const int axis_; @@ -51,10 +52,9 @@ class CropKernel : public OpenCLCropKernel { template MaceStatus CropKernel::Compute( - OpKernelContext *context, + OpContext *context, const std::vector &input_list, - Tensor *output, - StatsFuture *future) { + Tensor *output) { const int32_t inputs_count = static_cast(input_list.size()); MACE_CHECK(inputs_count >= 2) << "Crop opencl kernel only support 2 elements input"; @@ -181,9 +181,9 @@ MaceStatus CropKernel::Compute( Concat("crop_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/deconv_2d.h b/mace/kernels/opencl/image/deconv_2d.h index f1ce71c8..eae5978a 100644 --- a/mace/kernels/opencl/image/deconv_2d.h +++ b/mace/kernels/opencl/image/deconv_2d.h @@ -14,13 +14,15 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_DECONV_2D_H_ #define MACE_KERNELS_OPENCL_IMAGE_DECONV_2D_H_ -#include "mace/kernels/deconv_2d.h" +#include "mace/kernels/opencl/deconv_2d.h" #include #include #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -32,7 +34,7 @@ template class Deconv2dKernel : public OpenCLDeconv2dKernel { public: MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *filter, const Tensor *bias, @@ -41,8 +43,7 @@ class Deconv2dKernel : public OpenCLDeconv2dKernel { const ActivationType activation, const float relux_max_limit, const std::vector &output_shape, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: cl::Kernel kernel_; @@ -52,7 +53,7 @@ class Deconv2dKernel : public OpenCLDeconv2dKernel { 
template MaceStatus Deconv2dKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *filter, const Tensor *bias, @@ -61,8 +62,7 @@ MaceStatus Deconv2dKernel::Compute( const ActivationType activation, const float relux_max_limit, const std::vector &output_shape, - Tensor *output, - StatsFuture *future) { + Tensor *output) { std::vector output_image_shape; CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &output_image_shape); @@ -174,10 +174,10 @@ MaceStatus Deconv2dKernel::Compute( Concat("deconv2d_opencl_kernel_", activation, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/depth_to_space.h b/mace/kernels/opencl/image/depth_to_space.h index 280cdaa6..0a961d53 100644 --- a/mace/kernels/opencl/image/depth_to_space.h +++ b/mace/kernels/opencl/image/depth_to_space.h @@ -14,13 +14,15 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_DEPTH_TO_SPACE_H_ #define MACE_KERNELS_OPENCL_IMAGE_DEPTH_TO_SPACE_H_ -#include "mace/kernels/depth_to_space.h" +#include "mace/kernels/opencl/depth_to_space.h" #include #include #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -34,10 +36,9 @@ class DepthToSpaceKernel : public OpenCLDepthToSpaceKernel { explicit DepthToSpaceKernel(const int block_size) : block_size_(block_size) {} MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: const int block_size_; @@ -48,10 +49,9 @@ class DepthToSpaceKernel : public OpenCLDepthToSpaceKernel { template MaceStatus DepthToSpaceKernel::Compute( - 
OpKernelContext *context, + OpContext *context, const Tensor *input, - Tensor *output, - StatsFuture *future) { + Tensor *output) { const index_t batch = input->dim(0); const index_t input_height = input->dim(1); const index_t input_width = input->dim(2); @@ -130,10 +130,10 @@ MaceStatus DepthToSpaceKernel::Compute( output_width, output_depth); const std::vector lws = Default3DLocalWS(runtime, gws, kwg_size_); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/depthwise_conv2d.cc b/mace/kernels/opencl/image/depthwise_conv2d.cc index 00f0102e..57953960 100644 --- a/mace/kernels/opencl/image/depthwise_conv2d.cc +++ b/mace/kernels/opencl/image/depthwise_conv2d.cc @@ -63,7 +63,7 @@ std::vector LocalWS(OpenCLRuntime *runtime, } // namespace -MaceStatus DepthwiseConv2d(OpKernelContext *context, +MaceStatus DepthwiseConv2d(OpContext *context, cl::Kernel *kernel, const Tensor *input, // NHWC const Tensor *filter, // HWIM @@ -76,7 +76,6 @@ MaceStatus DepthwiseConv2d(OpKernelContext *context, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future, uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); @@ -181,10 +180,10 @@ MaceStatus DepthwiseConv2d(OpKernelContext *context, std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel", gws[0], gws[1], gws[2], multiplier); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, *kernel, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace depthwise diff --git a/mace/kernels/opencl/image/depthwise_conv2d.h b/mace/kernels/opencl/image/depthwise_conv2d.h index 8b5568f5..7bfa9ede 100644 --- 
a/mace/kernels/opencl/image/depthwise_conv2d.h +++ b/mace/kernels/opencl/image/depthwise_conv2d.h @@ -14,11 +14,13 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_DEPTHWISE_CONV2D_H_ #define MACE_KERNELS_OPENCL_IMAGE_DEPTHWISE_CONV2D_H_ -#include "mace/kernels/depthwise_conv2d.h" +#include "mace/kernels/opencl/depthwise_conv2d.h" #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -27,7 +29,7 @@ namespace opencl { namespace image { namespace depthwise { -MaceStatus DepthwiseConv2d(OpKernelContext *context, +MaceStatus DepthwiseConv2d(OpContext *context, cl::Kernel *kernel, const Tensor *input, // NHWC const Tensor *filter, // HWIM @@ -40,7 +42,6 @@ MaceStatus DepthwiseConv2d(OpKernelContext *context, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future, uint32_t *kwg_size); } // namespace depthwise @@ -49,7 +50,7 @@ template class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel { public: MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *filter, const Tensor *bias, @@ -59,8 +60,7 @@ class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel { const int *dilations, const ActivationType activation, const float relux_max_limit, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: cl::Kernel kernel_; @@ -70,7 +70,7 @@ class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel { template MaceStatus DepthwiseConv2dKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *filter, const Tensor *bias, @@ -80,8 +80,7 @@ MaceStatus DepthwiseConv2dKernel::Compute( const int *dilations, const ActivationType activation, const float relux_max_limit, - Tensor *output, - StatsFuture *future) { + Tensor *output) { index_t kernel_h = filter->dim(2); index_t kernel_w = filter->dim(3); if (strides[0] != strides[1]) { 
@@ -120,7 +119,7 @@ MaceStatus DepthwiseConv2dKernel::Compute( return depthwise::DepthwiseConv2d( context, &kernel_, input, filter, bias, strides[0], paddings.data(), dilations, activation, relux_max_limit, DataTypeToEnum::value, - &input_shape_, output, future, &kwg_size_); + &input_shape_, output, &kwg_size_); } } // namespace image diff --git a/mace/kernels/opencl/image/eltwise.h b/mace/kernels/opencl/image/eltwise.h index c2bbc3a5..d2352602 100644 --- a/mace/kernels/opencl/image/eltwise.h +++ b/mace/kernels/opencl/image/eltwise.h @@ -14,7 +14,7 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_ELTWISE_H_ #define MACE_KERNELS_OPENCL_IMAGE_ELTWISE_H_ -#include "mace/kernels/eltwise.h" +#include "mace/kernels/opencl/eltwise.h" #include #include @@ -22,6 +22,9 @@ #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" +#include "mace/kernels/eltwise.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -42,11 +45,10 @@ class EltwiseKernel : public OpenCLEltwiseKernel { scalar_input_(scalar_input), scalar_input_index_(scalar_input_index) {} MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input0, const Tensor *input1, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: EltwiseType type_; @@ -60,11 +62,10 @@ class EltwiseKernel : public OpenCLEltwiseKernel { template MaceStatus EltwiseKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input0, const Tensor *input1, - Tensor *output, - StatsFuture *future) { + Tensor *output) { bool swapped = false; if (input1 != nullptr) { MACE_CHECK(input0->dim_size() == input1->dim_size() || @@ -177,9 +178,9 @@ MaceStatus EltwiseKernel::Compute( Concat("eltwise_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return 
MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/fully_connected.h b/mace/kernels/opencl/image/fully_connected.h index d0d921d8..605c9ee9 100644 --- a/mace/kernels/opencl/image/fully_connected.h +++ b/mace/kernels/opencl/image/fully_connected.h @@ -14,13 +14,15 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_FULLY_CONNECTED_H_ #define MACE_KERNELS_OPENCL_IMAGE_FULLY_CONNECTED_H_ -#include "mace/kernels/fully_connected.h" +#include "mace/kernels/opencl/fully_connected.h" #include #include #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -32,14 +34,13 @@ template class FullyConnectedKernel : public OpenCLFullyConnectedKernel { public: MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *weight, const Tensor *bias, const ActivationType activation, const float relux_max_limit, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: cl::Kernel kernel_; @@ -50,14 +51,13 @@ class FullyConnectedKernel : public OpenCLFullyConnectedKernel { template MaceStatus FullyConnectedKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *weight, const Tensor *bias, const ActivationType activation, const float relux_max_limit, - Tensor *output, - StatsFuture *future) { + Tensor *output) { std::vector output_shape = {input->dim(0), 1, 1, weight->dim(0)}; std::vector output_image_shape; CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, @@ -170,8 +170,8 @@ MaceStatus FullyConnectedKernel::Compute( MACE_OUT_OF_RANGE_VALIDATION; MACE_CL_RET_STATUS(error); - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { + if (context->future() != nullptr) { + context->future()->wait_fn = [runtime, event](CallStats *stats) { event.wait(); if (stats != nullptr) { runtime->GetCallStats(event, 
stats); @@ -179,7 +179,7 @@ MaceStatus FullyConnectedKernel::Compute( }; } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/image_to_buffer.h b/mace/kernels/opencl/image/image_to_buffer.h index 0a345bf5..da8667f0 100644 --- a/mace/kernels/opencl/image/image_to_buffer.h +++ b/mace/kernels/opencl/image/image_to_buffer.h @@ -19,7 +19,8 @@ #include #include -#include "mace/kernels/buffer_inverse_transform.h" +#include "mace/core/op_context.h" +#include "mace/kernels/opencl/buffer_inverse_transform.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -30,12 +31,11 @@ namespace image { template class ImageToBuffer : public OpenCLBufferInverseTransformKernel { public: - MaceStatus Compute(OpKernelContext *context, + MaceStatus Compute(OpContext *context, const Tensor *input, const BufferType type, const int wino_blk_size, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: cl::Kernel kernel_; @@ -43,12 +43,11 @@ class ImageToBuffer : public OpenCLBufferInverseTransformKernel { }; template -MaceStatus ImageToBuffer::Compute(OpKernelContext *context, +MaceStatus ImageToBuffer::Compute(OpContext *context, const Tensor *input, const BufferType type, const int wino_blk_size, - Tensor *output, - StatsFuture *future) { + Tensor *output) { auto formatted_buffer_shape = FormatBufferShape(input->shape(), type); std::vector image_shape; CalImage2DShape(formatted_buffer_shape, type, &image_shape, wino_blk_size); @@ -172,8 +171,8 @@ MaceStatus ImageToBuffer::Compute(OpKernelContext *context, } MACE_CL_RET_STATUS(error); MACE_OUT_OF_RANGE_VALIDATION; - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { + if (context->future() != nullptr) { + context->future()->wait_fn = [runtime, event](CallStats *stats) { event.wait(); if (stats != nullptr) { runtime->GetCallStats(event, stats); @@ -181,7 +180,7 @@ MaceStatus 
ImageToBuffer::Compute(OpKernelContext *context, }; } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/lstm_cell.h b/mace/kernels/opencl/image/lstm_cell.h index 2b7d41d4..00b07356 100644 --- a/mace/kernels/opencl/image/lstm_cell.h +++ b/mace/kernels/opencl/image/lstm_cell.h @@ -14,12 +14,15 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_LSTM_CELL_H_ #define MACE_KERNELS_OPENCL_IMAGE_LSTM_CELL_H_ +#include "mace/kernels/opencl/lstm_cell.h" + #include #include #include #include -#include "mace/kernels/lstmcell.h" +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -34,15 +37,14 @@ class LSTMCellKernel : public OpenCLLSTMCellKernel { const T forget_bias) : forget_bias_(forget_bias) {} MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *pre_output, const Tensor *weight, const Tensor *bias, const Tensor *pre_cell, Tensor *cell, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: T forget_bias_; @@ -53,15 +55,14 @@ class LSTMCellKernel : public OpenCLLSTMCellKernel { template MaceStatus LSTMCellKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *pre_output, const Tensor *weight, const Tensor *bias, const Tensor *pre_cell, Tensor *cell, - Tensor *output, - StatsFuture *future) { + Tensor *output) { MACE_CHECK(pre_output->dim_size() == 2 && pre_output->dim(1) % 4 == 0, "LSTM hidden units should be a multiple of 4"); @@ -126,10 +127,10 @@ MaceStatus LSTMCellKernel::Compute( std::string tuning_key = Concat("lstmcell_opencl_kernel", output->dim(0), output->dim(1)); MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff 
--git a/mace/kernels/opencl/image/matmul.h b/mace/kernels/opencl/image/matmul.h index 75188752..aa688646 100644 --- a/mace/kernels/opencl/image/matmul.h +++ b/mace/kernels/opencl/image/matmul.h @@ -14,7 +14,7 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_MATMUL_H_ #define MACE_KERNELS_OPENCL_IMAGE_MATMUL_H_ -#include "mace/kernels/matmul.h" +#include "mace/kernels/opencl/matmul.h" #include #include @@ -22,6 +22,8 @@ #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -33,13 +35,12 @@ template class MatMulKernel : public OpenCLMatMulKernel { public: MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *A, const Tensor *B, Tensor *C, bool transpose_a, - bool transpose_b, - StatsFuture *future) override; + bool transpose_b) override; private: cl::Kernel kernel_; @@ -48,14 +49,12 @@ class MatMulKernel : public OpenCLMatMulKernel { template MaceStatus MatMulKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *A, const Tensor *B, Tensor *C, bool transpose_a, - bool transpose_b, - StatsFuture *future) { - MACE_UNUSED(future); + bool transpose_b) { MACE_CHECK(!transpose_a && !transpose_b, "GPU does not support transpose matmul"); @@ -115,10 +114,10 @@ MaceStatus MatMulKernel::Compute( const std::vector lws = {kwg_size_ / 64, 64, 0}; std::string tuning_key = Concat("matmul_opencl_kernel", batch, height, width); MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/pad.h b/mace/kernels/opencl/image/pad.h index 1533b6d2..b9673e9e 100644 --- a/mace/kernels/opencl/image/pad.h +++ b/mace/kernels/opencl/image/pad.h @@ -14,13 +14,15 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_PAD_H_ #define MACE_KERNELS_OPENCL_IMAGE_PAD_H_ 
-#include "mace/kernels/pad.h" +#include "mace/kernels/opencl/pad.h" #include #include #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -36,10 +38,9 @@ class PadKernel : public OpenCLPadKernel { : paddings_(paddings), constant_value_(constant_value) {} MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: std::vector paddings_; @@ -51,10 +52,9 @@ class PadKernel : public OpenCLPadKernel { template MaceStatus PadKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, - Tensor *output, - StatsFuture *future) { + Tensor *output) { MACE_CHECK(this->paddings_.size() == static_cast((input->dim_size() * 2))); MACE_CHECK((this->paddings_[0] == 0) && (this->paddings_[1] == 0) && @@ -122,10 +122,10 @@ MaceStatus PadKernel::Compute( std::string tuning_key = Concat("pad", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/pooling.h b/mace/kernels/opencl/image/pooling.h index 8b11475e..769f3cf8 100644 --- a/mace/kernels/opencl/image/pooling.h +++ b/mace/kernels/opencl/image/pooling.h @@ -14,7 +14,7 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_POOLING_H_ #define MACE_KERNELS_OPENCL_IMAGE_POOLING_H_ -#include "mace/kernels/pooling.h" +#include "mace/kernels/opencl/pooling.h" #include #include @@ -22,6 +22,8 @@ #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -59,7 +61,7 @@ template class PoolingKernel : public OpenCLPoolingKernel { public: MaceStatus Compute( - 
OpKernelContext *context, + OpContext *context, const Tensor *input, const PoolingType pooling_type, const int *kernels, @@ -67,8 +69,7 @@ class PoolingKernel : public OpenCLPoolingKernel { const Padding &padding_type, const std::vector &padding_data, const int *dilations, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: cl::Kernel kernel_; @@ -78,7 +79,7 @@ class PoolingKernel : public OpenCLPoolingKernel { template MaceStatus PoolingKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const PoolingType pooling_type, const int *kernels, @@ -86,8 +87,7 @@ MaceStatus PoolingKernel::Compute( const Padding &padding_type, const std::vector &padding_data, const int *dilations, - Tensor *output, - StatsFuture *future) { + Tensor *output) { MACE_CHECK(dilations[0] == 1 && dilations[1] == 1) << "Pooling opencl kernel not support dilation yet"; @@ -173,10 +173,10 @@ MaceStatus PoolingKernel::Compute( Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/reduce_mean.h b/mace/kernels/opencl/image/reduce_mean.h index 3c826540..7d7c5fba 100644 --- a/mace/kernels/opencl/image/reduce_mean.h +++ b/mace/kernels/opencl/image/reduce_mean.h @@ -14,13 +14,15 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_REDUCE_MEAN_H_ #define MACE_KERNELS_OPENCL_IMAGE_REDUCE_MEAN_H_ -#include "mace/kernels/reduce_mean.h" +#include "mace/kernels/opencl/reduce_mean.h" #include #include #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -36,10 +38,9 @@ class ReduceMeanKernel : public OpenCLReduceMeanKernel { : axis_(axis), 
keep_dims_(keep_dims) {} MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: const std::vector axis_; @@ -51,10 +52,9 @@ class ReduceMeanKernel : public OpenCLReduceMeanKernel { template MaceStatus ReduceMeanKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, - Tensor *output, - StatsFuture *future) { + Tensor *output) { MACE_CHECK_NOTNULL(input); // MACE_CHECK(keep_dims_, "reduce mean gpu only support keep dims."); MACE_CHECK(input->dim_size() == 4, @@ -157,8 +157,8 @@ MaceStatus ReduceMeanKernel::Compute( MACE_CL_RET_STATUS(error); MACE_OUT_OF_RANGE_VALIDATION; - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { + if (context->future() != nullptr) { + context->future()->wait_fn = [runtime, event](CallStats *stats) { event.wait(); if (stats != nullptr) { runtime->GetCallStats(event, stats); @@ -166,7 +166,7 @@ MaceStatus ReduceMeanKernel::Compute( }; } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/resize_bicubic.h b/mace/kernels/opencl/image/resize_bicubic.h index 669f6445..20d062ac 100644 --- a/mace/kernels/opencl/image/resize_bicubic.h +++ b/mace/kernels/opencl/image/resize_bicubic.h @@ -14,7 +14,7 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_RESIZE_BICUBIC_H_ #define MACE_KERNELS_OPENCL_IMAGE_RESIZE_BICUBIC_H_ -#include "mace/kernels/resize_bicubic.h" +#include "mace/kernels/opencl/resize_bicubic.h" #include #include @@ -22,7 +22,10 @@ #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" +#include "mace/kernels/resize_bicubic.h" namespace mace { namespace kernels { @@ -68,10 +71,9 @@ class ResizeBicubicKernel : public OpenCLResizeBicubicKernel { out_width_(out_width) {} MaceStatus Compute( - OpKernelContext *context, + OpContext 
*context, const Tensor *input, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: bool align_corners_; @@ -84,10 +86,9 @@ class ResizeBicubicKernel : public OpenCLResizeBicubicKernel { template MaceStatus ResizeBicubicKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, - Tensor *output, - StatsFuture *future) { + Tensor *output) { const index_t batch = input->dim(0); const index_t in_height = input->dim(1); const index_t in_width = input->dim(2); @@ -113,7 +114,9 @@ MaceStatus ResizeBicubicKernel::Compute( built_options.emplace("-Dresize_bicubic_nocache=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpCompatibleCLCMDDt(dt)); - built_options.emplace(MakeString("-DTABLE_SIZE=", kTableSize)); + built_options.emplace( + MakeString("-DTABLE_SIZE=", + mace::kernels::resize_bicubic::kTableSize)); MACE_RETURN_IF_ERROR( runtime->BuildKernel("resize_bicubic", kernel_name, @@ -135,9 +138,11 @@ MaceStatus ResizeBicubicKernel::Compute( MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); float height_scale = - CalculateResizeScale(in_height, out_height, align_corners_); + mace::kernels::resize_bicubic::CalculateResizeScale( + in_height, out_height, align_corners_); float width_scale = - CalculateResizeScale(in_width, out_width, align_corners_); + mace::kernels::resize_bicubic::CalculateResizeScale( + in_width, out_width, align_corners_); uint32_t idx = 0; MACE_OUT_OF_RANGE_SET_ARGS(kernel_); @@ -159,10 +164,10 @@ MaceStatus ResizeBicubicKernel::Compute( Concat("resize_bicubic_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git 
a/mace/kernels/opencl/image/resize_bilinear.h b/mace/kernels/opencl/image/resize_bilinear.h index 459babc9..d34b7d50 100644 --- a/mace/kernels/opencl/image/resize_bilinear.h +++ b/mace/kernels/opencl/image/resize_bilinear.h @@ -14,7 +14,7 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_RESIZE_BILINEAR_H_ #define MACE_KERNELS_OPENCL_IMAGE_RESIZE_BILINEAR_H_ -#include "mace/kernels/resize_bilinear.h" +#include "mace/kernels/opencl/resize_bilinear.h" #include #include @@ -22,7 +22,10 @@ #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" +#include "mace/kernels/resize_bilinear.h" namespace mace { namespace kernels { @@ -73,10 +76,9 @@ class ResizeBilinearKernel : public OpenCLResizeBilinearKernel { out_width_(out_width) {} MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: bool align_corners_; @@ -89,10 +91,9 @@ class ResizeBilinearKernel : public OpenCLResizeBilinearKernel { template MaceStatus ResizeBilinearKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, - Tensor *output, - StatsFuture *future) { + Tensor *output) { const index_t batch = input->dim(0); const index_t in_height = input->dim(1); const index_t in_width = input->dim(2); @@ -138,9 +139,13 @@ MaceStatus ResizeBilinearKernel::Compute( MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); float height_scale = - CalculateResizeScale(in_height, out_height, align_corners_); + mace::kernels::resize_bilinear::CalculateResizeScale(in_height, + out_height, + align_corners_); float width_scale = - CalculateResizeScale(in_width, out_width, align_corners_); + mace::kernels::resize_bilinear::CalculateResizeScale(in_width, + out_width, + align_corners_); uint32_t idx = 0; MACE_OUT_OF_RANGE_SET_ARGS(kernel_); @@ -162,10 +167,10 @@ MaceStatus ResizeBilinearKernel::Compute( 
Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/softmax.h b/mace/kernels/opencl/image/softmax.h index 0c3aa618..cf2dd5b4 100644 --- a/mace/kernels/opencl/image/softmax.h +++ b/mace/kernels/opencl/image/softmax.h @@ -14,7 +14,7 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_SOFTMAX_H_ #define MACE_KERNELS_OPENCL_IMAGE_SOFTMAX_H_ -#include "mace/kernels/softmax.h" +#include "mace/kernels/opencl/softmax.h" #include #include @@ -22,6 +22,8 @@ #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -58,10 +60,9 @@ template class SoftmaxKernel : public OpenCLSoftmaxKernel { public: MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *logits, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: cl::Kernel kernel_; @@ -71,10 +72,9 @@ class SoftmaxKernel : public OpenCLSoftmaxKernel { template MaceStatus SoftmaxKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *logits, - Tensor *output, - StatsFuture *future) { + Tensor *output) { index_t batch = 0; index_t height = 0; index_t width = 0; @@ -137,10 +137,10 @@ MaceStatus SoftmaxKernel::Compute( std::string tuning_key = Concat("softmax_opencl_kernel", batch, height, width, channels); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/space_to_batch.h b/mace/kernels/opencl/image/space_to_batch.h index 
89bcdf6a..0a20e6f6 100644 --- a/mace/kernels/opencl/image/space_to_batch.h +++ b/mace/kernels/opencl/image/space_to_batch.h @@ -14,13 +14,15 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_SPACE_TO_BATCH_H_ #define MACE_KERNELS_OPENCL_IMAGE_SPACE_TO_BATCH_H_ -#include "mace/kernels/space_to_batch.h" +#include "mace/kernels/opencl/space_to_batch.h" #include #include #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -32,13 +34,12 @@ template class SpaceToBatchKernel : public OpenCLSpaceToBatchKernel { public: MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *space_tensor, const std::vector &paddings, const std::vector &block_shape, const std::vector &output_shape, - Tensor *batch_tensor, - StatsFuture *future) override; + Tensor *batch_tensor) override; private: cl::Kernel kernel_; @@ -48,13 +49,12 @@ class SpaceToBatchKernel : public OpenCLSpaceToBatchKernel { template MaceStatus SpaceToBatchKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *space_tensor, const std::vector &paddings, const std::vector &block_shape, const std::vector &output_shape, - Tensor *batch_tensor, - StatsFuture *future) { + Tensor *batch_tensor) { std::vector output_image_shape; CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, &output_image_shape); @@ -114,10 +114,10 @@ MaceStatus SpaceToBatchKernel::Compute( Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1), batch_tensor->dim(2), batch_tensor->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/space_to_depth.h b/mace/kernels/opencl/image/space_to_depth.h index e1247dc3..2e3f2a74 100644 --- a/mace/kernels/opencl/image/space_to_depth.h +++ 
b/mace/kernels/opencl/image/space_to_depth.h @@ -14,13 +14,15 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_SPACE_TO_DEPTH_H_ #define MACE_KERNELS_OPENCL_IMAGE_SPACE_TO_DEPTH_H_ -#include "mace/kernels/space_to_depth.h" +#include "mace/kernels/opencl/space_to_depth.h" #include #include #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -34,10 +36,9 @@ class SpaceToDepthKernel : public OpenCLSpaceToDepthKernel { explicit SpaceToDepthKernel(const int block_size) : block_size_(block_size) {} MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: const int block_size_; @@ -48,10 +49,9 @@ class SpaceToDepthKernel : public OpenCLSpaceToDepthKernel { template MaceStatus SpaceToDepthKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, - Tensor *output, - StatsFuture *future) { + Tensor *output) { const index_t batch = input->dim(0); const index_t input_height = input->dim(1); const index_t input_width = input->dim(2); @@ -124,10 +124,10 @@ MaceStatus SpaceToDepthKernel::Compute( std::string tuning_key = Concat("space_to_depth_opencl_kernel", input->dim(0), input->dim(1), input->dim(2), input->dim(3)); MACE_RETURN_IF_ERROR(TuningOrRun3DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/split.h b/mace/kernels/opencl/image/split.h index a75642a8..ee7fab71 100644 --- a/mace/kernels/opencl/image/split.h +++ b/mace/kernels/opencl/image/split.h @@ -14,7 +14,7 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_SPLIT_H_ #define MACE_KERNELS_OPENCL_IMAGE_SPLIT_H_ -#include "mace/kernels/split.h" +#include "mace/kernels/opencl/split.h" #include #include @@ -22,6 +22,8 @@ 
#include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -34,10 +36,9 @@ class SplitKernel : public OpenCLSplitKernel { public: explicit SplitKernel(const int32_t axis) : axis_(axis) {} MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, - const std::vector &output_list, - StatsFuture *future) override; + const std::vector &output_list) override; private: int32_t axis_; @@ -47,10 +48,9 @@ class SplitKernel : public OpenCLSplitKernel { template MaceStatus SplitKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, - const std::vector &output_list, - StatsFuture *future) { + const std::vector &output_list) { const index_t input_channels = input->dim(3); const size_t outputs_count = output_list.size(); const index_t output_channels = input_channels / outputs_count; @@ -123,7 +123,7 @@ MaceStatus SplitKernel::Compute( } MACE_CL_RET_STATUS(error); MACE_OUT_OF_RANGE_VALIDATION; - if (future != nullptr && runtime->is_profiling_enabled()) { + if (context->future() != nullptr && runtime->is_profiling_enabled()) { event.wait(); CallStats tmp_stats; runtime->GetCallStats(event, &tmp_stats); @@ -132,8 +132,8 @@ MaceStatus SplitKernel::Compute( call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros; } } - if (future != nullptr) { - future->wait_fn = [runtime, call_stats](CallStats *stats) { + if (context->future() != nullptr) { + context->future()->wait_fn = [runtime, call_stats](CallStats *stats) { if (stats != nullptr) { stats->start_micros = call_stats.start_micros; stats->end_micros = stats->start_micros + call_stats.end_micros; @@ -141,7 +141,7 @@ MaceStatus SplitKernel::Compute( }; } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/sqrdiff_mean.h b/mace/kernels/opencl/image/sqrdiff_mean.h index 31959a62..3d86b05d 100644 --- 
a/mace/kernels/opencl/image/sqrdiff_mean.h +++ b/mace/kernels/opencl/image/sqrdiff_mean.h @@ -14,13 +14,15 @@ #ifndef MACE_KERNELS_OPENCL_IMAGE_SQRDIFF_MEAN_H_ #define MACE_KERNELS_OPENCL_IMAGE_SQRDIFF_MEAN_H_ -#include "mace/kernels/sqrdiff_mean.h" +#include "mace/kernels/opencl/sqrdiff_mean.h" #include #include #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -31,14 +33,11 @@ namespace image { template class SqrDiffMeanKernel : public OpenCLSqrDiffMeanKernel { public: - SqrDiffMeanKernel() {} - MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input, const Tensor *input1, - Tensor *output, - StatsFuture *future) override; + Tensor *output) override; private: cl::Kernel kernel_; @@ -48,11 +47,10 @@ class SqrDiffMeanKernel : public OpenCLSqrDiffMeanKernel { template MaceStatus SqrDiffMeanKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input0, const Tensor *input1, - Tensor *output, - StatsFuture *future) { + Tensor *output) { MACE_CHECK_NOTNULL(input0); MACE_CHECK_NOTNULL(input1); MACE_CHECK(input0->dim(0) == input1->dim(0) && @@ -156,8 +154,8 @@ MaceStatus SqrDiffMeanKernel::Compute( MACE_CL_RET_STATUS(error); MACE_OUT_OF_RANGE_VALIDATION; - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { + if (context->future() != nullptr) { + context->future()->wait_fn = [runtime, event](CallStats *stats) { event.wait(); if (stats != nullptr) { runtime->GetCallStats(event, stats); @@ -165,7 +163,7 @@ MaceStatus SqrDiffMeanKernel::Compute( }; } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image diff --git a/mace/kernels/opencl/image/winograd_transform.h b/mace/kernels/opencl/image/winograd_transform.h index 107c8dc0..f00e5556 100644 --- a/mace/kernels/opencl/image/winograd_transform.h +++ b/mace/kernels/opencl/image/winograd_transform.h @@ -14,13 +14,17 @@ 
#ifndef MACE_KERNELS_OPENCL_IMAGE_WINOGRAD_TRANSFORM_H_ #define MACE_KERNELS_OPENCL_IMAGE_WINOGRAD_TRANSFORM_H_ -#include "mace/kernels/winograd_transform.h" +#include "mace/kernels/opencl/winograd_transform.h" #include #include #include #include +#include "mace/core/op_context.h" +#include "mace/core/tensor.h" +#include "mace/kernels/activation.h" +#include "mace/kernels/conv_pool_2d_util.h" #include "mace/kernels/opencl/helper.h" namespace mace { @@ -32,7 +36,7 @@ template class WinogradTransformKernel : public OpenCLWinogradTransformKernel { public: WinogradTransformKernel( - const Padding &padding_type, + Padding padding_type, const std::vector &paddings, const int block_size) : strides_({1, 1}), @@ -41,10 +45,9 @@ class WinogradTransformKernel : public OpenCLWinogradTransformKernel { paddings_(paddings), wino_blk_size_(block_size) {} MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input_tensor, - Tensor *output_tensor, - StatsFuture *future) override; + Tensor *output_tensor) override; private: const std::vector strides_; // [stride_h, stride_w] @@ -59,10 +62,9 @@ class WinogradTransformKernel : public OpenCLWinogradTransformKernel { template MaceStatus WinogradTransformKernel::Compute( - OpKernelContext *context, + OpContext *context, const Tensor *input_tensor, - Tensor *output_tensor, - StatsFuture *future) { + Tensor *output_tensor) { auto runtime = context->device()->opencl_runtime(); MACE_OUT_OF_RANGE_DEFINITION; @@ -83,7 +85,7 @@ MaceStatus WinogradTransformKernel::Compute( + obfuscated_kernel_name); } else { MACE_CHECK(false, "mace only supports 4x4 and 2x2 gpu winograd."); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } built_options.emplace("-DDATA_TYPE=" + DtToUpCompatibleCLDt(DataTypeToEnum::value)); @@ -162,10 +164,10 @@ MaceStatus WinogradTransformKernel::Compute( output_tensor->dim(1), output_tensor->dim(2)); MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, - gws, lws, 
future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } template @@ -173,17 +175,16 @@ class WinogradInverseTransformKernel : public OpenCLWinogradInverseTransformKernel { public: WinogradInverseTransformKernel( - const ActivationType activation, + ActivationType activation, const float relux_max_limit, const int block_size) : wino_blk_size_(block_size), activation_(activation), relux_max_limit_(relux_max_limit) {} MaceStatus Compute( - OpKernelContext *context, + OpContext *context, const std::vector &inputs, - Tensor *output_tensor, - StatsFuture *future) override; + Tensor *output_tensor) override; private: const int wino_blk_size_; @@ -196,10 +197,9 @@ class WinogradInverseTransformKernel template MaceStatus WinogradInverseTransformKernel::Compute( - OpKernelContext *context, + OpContext *context, const std::vector &inputs, - Tensor *output_tensor, - StatsFuture *future) { + Tensor *output_tensor) { auto runtime = context->device()->opencl_runtime(); MACE_OUT_OF_RANGE_DEFINITION; @@ -223,7 +223,7 @@ MaceStatus WinogradInverseTransformKernel::Compute( + obfuscated_kernel_name); } else { MACE_CHECK(false, "mace only supports 4x4 and 2x2 gpu winograd."); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } built_options.emplace("-DDATA_TYPE=" + @@ -312,10 +312,10 @@ MaceStatus WinogradInverseTransformKernel::Compute( output_tensor->dim(1), output_tensor->dim(2), output_tensor->dim(3), input_tensor->dim(2)); MACE_RETURN_IF_ERROR(TuningOrRun2DKernel(runtime, kernel_, tuning_key, - gws, lws, future)); + gws, lws, context->future())); MACE_OUT_OF_RANGE_VALIDATION; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } } // namespace image } // namespace opencl diff --git a/mace/kernels/opencl/lstm_cell.h b/mace/kernels/opencl/lstm_cell.h new file mode 100644 index 00000000..0ce1d26f --- /dev/null +++ b/mace/kernels/opencl/lstm_cell.h @@ -0,0 +1,44 @@ +// Copyright 2018 Xiaomi, 
Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_KERNELS_OPENCL_LSTM_CELL_H_ +#define MACE_KERNELS_OPENCL_LSTM_CELL_H_ + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLLSTMCellKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + const Tensor *pre_output, + const Tensor *weight, + const Tensor *bias, + const Tensor *pre_cell, + Tensor *cell, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLLSTMCellKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_LSTM_CELL_H_ diff --git a/mace/kernels/opencl/lstmcell.cc b/mace/kernels/opencl/lstmcell.cc deleted file mode 100644 index e210ee58..00000000 --- a/mace/kernels/opencl/lstmcell.cc +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/kernels/lstmcell.h" -#include "mace/kernels/opencl/image/lstm_cell.h" - -namespace mace { -namespace kernels { - -template -LSTMCellFunctor::LSTMCellFunctor( - OpKernelContext *context, - T forget_bias) - : OpKernel(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::LSTMCellKernel(forget_bias)); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus LSTMCellFunctor::operator()( - const Tensor *input, - const Tensor *pre_output, - const Tensor *weight, - const Tensor *bias, - const Tensor *pre_cell, - Tensor *cell, - Tensor *output, - StatsFuture *future) { - return kernel_->Compute(context_, input, pre_output, weight, bias, - pre_cell, cell, output, future); -} - -template struct LSTMCellFunctor; -template struct LSTMCellFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc deleted file mode 100644 index b8ddc1c8..00000000 --- a/mace/kernels/opencl/matmul.cc +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/matmul.h" -#include "mace/kernels/opencl/image/matmul.h" - -namespace mace { -namespace kernels { - -template -MatMulFunctor::MatMulFunctor(OpKernelContext *context) - : OpKernel(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::MatMulKernel); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus MatMulFunctor::operator()(const Tensor *A, - const Tensor *B, - Tensor *C, - bool transpose_a, - bool transpose_b, - StatsFuture *future) { - return kernel_->Compute(context_, A, B, C, transpose_a, transpose_b, future); -} - -template struct MatMulFunctor; - -template struct MatMulFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/matmul.h b/mace/kernels/opencl/matmul.h new file mode 100644 index 00000000..e971328e --- /dev/null +++ b/mace/kernels/opencl/matmul.h @@ -0,0 +1,42 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_MATMUL_H_ +#define MACE_KERNELS_OPENCL_MATMUL_H_ + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLMatMulKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *A, + const Tensor *B, + Tensor *C, + bool transpose_a, + bool transpose_b) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLMatMulKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_MATMUL_H_ diff --git a/mace/kernels/opencl/out_of_range_check_test.cc b/mace/kernels/opencl/out_of_range_check_test.cc index f61b9e87..957026b2 100644 --- a/mace/kernels/opencl/out_of_range_check_test.cc +++ b/mace/kernels/opencl/out_of_range_check_test.cc @@ -16,7 +16,7 @@ #include #include "gtest/gtest.h" -#include "mace/core/op_kernel_context.h" +#include "mace/core/op_context.h" #include "mace/core/runtime/opencl/gpu_device.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/tensor.h" @@ -27,10 +27,10 @@ namespace mace { namespace kernels { namespace { -bool BufferToImageOpImpl(OpKernelContext *context, - Tensor *buffer, - Tensor *image, - const std::vector &image_shape) { +MaceStatus BufferToImageOpImpl(OpContext *context, + Tensor *buffer, + Tensor *image, + const std::vector &image_shape) { std::unique_ptr oorc_flag; uint32_t gws[2] = {static_cast(image_shape[0]), static_cast(image_shape[1])}; @@ -59,14 +59,10 @@ bool BufferToImageOpImpl(OpKernelContext *context, } cl::Kernel kernel; - cl_int error = runtime->BuildKernel("buffer_to_image", - obfuscated_kernel_name, - built_options, - &kernel); - if (error != CL_SUCCESS) { - return false; - } - + MACE_RETURN_IF_ERROR(runtime->BuildKernel("buffer_to_image", + obfuscated_kernel_name, + built_options, + &kernel)); MACE_OUT_OF_RANGE_INIT(kernel); uint32_t idx = 0; if (runtime->IsOutOfRangeCheckEnabled()) { @@ -89,6 +85,7 @@ bool 
BufferToImageOpImpl(OpKernelContext *context, static_cast(runtime->GetKernelMaxWorkGroupSize(kernel)); const std::vector lws = {16, kwg_size / 16}; + cl_int error; cl::Event event; if (runtime->IsNonUniformWorkgroupsSupported()) { error = runtime->command_queue().enqueueNDRangeKernel( @@ -105,7 +102,7 @@ bool BufferToImageOpImpl(OpKernelContext *context, cl::NDRange(lws[0], lws[1]), nullptr, &event); } if (error != CL_SUCCESS) { - return false; + return MaceStatus::MACE_OUT_OF_RESOURCES; } runtime->command_queue().finish(); @@ -115,7 +112,8 @@ bool BufferToImageOpImpl(OpKernelContext *context, is_out_of_range = *(oorc_flag->mutable_data()) == 1 ? true : false; oorc_flag->UnMap(); } - return is_out_of_range; + return is_out_of_range ? MaceStatus::MACE_OUT_OF_RESOURCES + : MaceStatus::MACE_SUCCESS; } } // namespace @@ -135,7 +133,7 @@ TEST(OutOfRangeCheckTest, RandomTest) { std::unique_ptr device(new GPUDevice(gpu_context.opencl_tuner())); Workspace ws; - OpKernelContext context(&ws, device.get()); + OpContext context(&ws, device.get()); std::vector buffer_shape = {batch, height, width, channels}; Tensor *buffer = @@ -148,7 +146,8 @@ TEST(OutOfRangeCheckTest, RandomTest) { DataTypeToEnum::v()); CalImage2DShape(buffer->shape(), IN_OUT_CHANNEL, &image_shape); image->ResizeImage(buffer->shape(), image_shape); - ASSERT_FALSE(BufferToImageOpImpl(&context, buffer, image, image_shape)); + ASSERT_FALSE(BufferToImageOpImpl(&context, buffer, image, image_shape) + != MaceStatus::MACE_SUCCESS); std::vector overflow_image_shape = image_shape; for (size_t i = 0; i < overflow_image_shape.size(); ++i) { @@ -157,7 +156,8 @@ TEST(OutOfRangeCheckTest, RandomTest) { ASSERT_TRUE(BufferToImageOpImpl(&context, buffer, image, - overflow_image_shape)); + overflow_image_shape) + != MaceStatus::MACE_SUCCESS); } } // namespace kernels diff --git a/mace/kernels/opencl/pad.cc b/mace/kernels/opencl/pad.cc deleted file mode 100644 index 759b9219..00000000 --- a/mace/kernels/opencl/pad.cc +++ 
/dev/null @@ -1,45 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/kernels/pad.h" -#include "mace/kernels/opencl/image/pad.h" - -namespace mace { -namespace kernels { - -template -PadFunctor::PadFunctor( - OpKernelContext *context, - const std::vector &paddings, - const float constant_value) - : OpKernel(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::PadKernel(paddings, constant_value)); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus PadFunctor::operator()(const Tensor *input, - Tensor *output, - StatsFuture *future) { - return kernel_->Compute(context_, input, output, future); -} - -template struct PadFunctor; -template struct PadFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/pad.h b/mace/kernels/opencl/pad.h new file mode 100644 index 00000000..ec91a446 --- /dev/null +++ b/mace/kernels/opencl/pad.h @@ -0,0 +1,38 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_KERNELS_OPENCL_PAD_H_ +#define MACE_KERNELS_OPENCL_PAD_H_ + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLPadKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLPadKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_PAD_H_ diff --git a/mace/kernels/opencl/pooling.cc b/mace/kernels/opencl/pooling.cc deleted file mode 100644 index aab53664..00000000 --- a/mace/kernels/opencl/pooling.cc +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/pooling.h" - -#include "mace/kernels/opencl/buffer/pooling.h" -#include "mace/kernels/opencl/helper.h" -#include "mace/kernels/opencl/image/pooling.h" - -namespace mace { -namespace kernels { - -template -PoolingFunctor::PoolingFunctor( - OpKernelContext *context, - const PoolingType pooling_type, - const int *kernels, - const int *strides, - const Padding padding_type, - const std::vector &paddings, - const int *dilations) - : PoolingFunctorBase(context, - pooling_type, - kernels, - strides, - padding_type, - paddings, - dilations) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::PoolingKernel); - } else { - kernel_.reset(new opencl::buffer::PoolingKernel); - } -} - -template -MaceStatus PoolingFunctor::operator()( - const Tensor *input, - Tensor *output, - StatsFuture *future) { - return kernel_->Compute(context_, input, pooling_type_, kernels_, strides_, - padding_type_, paddings_, dilations_, - output, future); -} - -template struct PoolingFunctor; -template struct PoolingFunctor; -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/pooling.h b/mace/kernels/opencl/pooling.h new file mode 100644 index 00000000..ce3c8b54 --- /dev/null +++ b/mace/kernels/opencl/pooling.h @@ -0,0 +1,46 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_POOLING_H_ +#define MACE_KERNELS_OPENCL_POOLING_H_ + +#include + +#include "mace/kernels/pooling.h" +#include "mace/kernels/conv_pool_2d_util.h" + +namespace mace { + +class OpContext; +class Tensor; +namespace kernels { +class OpenCLPoolingKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + const PoolingType pooling_type, + const int *kernels, + const int *strides, + const Padding &padding_type, + const std::vector &padding_data, + const int *dilations, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLPoolingKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_POOLING_H_ diff --git a/mace/kernels/opencl/reduce_mean.cc b/mace/kernels/opencl/reduce_mean.cc deleted file mode 100644 index b504334a..00000000 --- a/mace/kernels/opencl/reduce_mean.cc +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/reduce_mean.h" -#include "mace/kernels/opencl/image/reduce_mean.h" - -namespace mace { -namespace kernels { - -template -ReduceMeanFunctor::ReduceMeanFunctor( - OpKernelContext *context, - const std::vector &axis, - const bool keep_dims) : OpKernel(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::ReduceMeanKernel(axis, keep_dims)); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus ReduceMeanFunctor::operator()( - const Tensor *input, - Tensor *output, - StatsFuture *future) { - return kernel_->Compute(context_, input, output, future); -} - -template struct ReduceMeanFunctor; -template struct ReduceMeanFunctor; -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/reduce_mean.h b/mace/kernels/opencl/reduce_mean.h new file mode 100644 index 00000000..1960aac5 --- /dev/null +++ b/mace/kernels/opencl/reduce_mean.h @@ -0,0 +1,39 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_REDUCE_MEAN_H_ +#define MACE_KERNELS_OPENCL_REDUCE_MEAN_H_ + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLReduceMeanKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLReduceMeanKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_REDUCE_MEAN_H_ diff --git a/mace/kernels/opencl/resize_bicubic.cc b/mace/kernels/opencl/resize_bicubic.cc deleted file mode 100644 index e45ced4b..00000000 --- a/mace/kernels/opencl/resize_bicubic.cc +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/resize_bicubic.h" -#include "mace/kernels/opencl/image/resize_bicubic.h" - -namespace mace { -namespace kernels { - -template -ResizeBicubicFunctor::ResizeBicubicFunctor( - OpKernelContext *context, - bool align_corners, - const std::vector &size) - : OpKernel(context) { - MACE_CHECK(size.size() == 2); - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::ResizeBicubicKernel(align_corners, - size[0], - size[1])); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus ResizeBicubicFunctor::operator()( - const Tensor *input, Tensor *output, StatsFuture *future) { - return kernel_->Compute(context_, input, output, future); -} - -template struct ResizeBicubicFunctor; -template struct ResizeBicubicFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/resize_bicubic.h b/mace/kernels/opencl/resize_bicubic.h new file mode 100644 index 00000000..bfb6f8b5 --- /dev/null +++ b/mace/kernels/opencl/resize_bicubic.h @@ -0,0 +1,39 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_RESIZE_BICUBIC_H_ +#define MACE_KERNELS_OPENCL_RESIZE_BICUBIC_H_ + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" +#include "mace/core/types.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLResizeBicubicKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLResizeBicubicKernel); +}; +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_RESIZE_BICUBIC_H_ diff --git a/mace/kernels/opencl/resize_bilinear.cc b/mace/kernels/opencl/resize_bilinear.cc deleted file mode 100644 index 585cab76..00000000 --- a/mace/kernels/opencl/resize_bilinear.cc +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/resize_bilinear.h" -#include "mace/kernels/opencl/image/resize_bilinear.h" - -namespace mace { -namespace kernels { - -template -ResizeBilinearFunctor::ResizeBilinearFunctor( - OpKernelContext *context, - const std::vector &size, - bool align_corners) : OpKernel(context) { - MACE_CHECK(size.size() == 2); - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::ResizeBilinearKernel(align_corners, - size[0], - size[1])); - } else { - MACE_NOT_IMPLEMENTED; - } -} -template -MaceStatus ResizeBilinearFunctor::operator()( - const Tensor *input, Tensor *output, StatsFuture *future) { - return kernel_->Compute(context_, input, output, future); -} - -template struct ResizeBilinearFunctor; -template struct ResizeBilinearFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/resize_bilinear.h b/mace/kernels/opencl/resize_bilinear.h new file mode 100644 index 00000000..f60fb282 --- /dev/null +++ b/mace/kernels/opencl/resize_bilinear.h @@ -0,0 +1,39 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_RESIZE_BILINEAR_H_ +#define MACE_KERNELS_OPENCL_RESIZE_BILINEAR_H_ + +#include "mace/core/types.h" +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLResizeBilinearKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLResizeBilinearKernel); +}; +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_RESIZE_BILINEAR_H_ diff --git a/mace/kernels/opencl/softmax.cc b/mace/kernels/opencl/softmax.cc deleted file mode 100644 index bad5f1fa..00000000 --- a/mace/kernels/opencl/softmax.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/softmax.h" - -#include "mace/kernels/opencl/buffer/softmax.h" -#include "mace/kernels/opencl/helper.h" -#include "mace/kernels/opencl/image/softmax.h" - -namespace mace { -namespace kernels { - -template -SoftmaxFunctor::SoftmaxFunctor(OpKernelContext *context) - : OpKernel(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::SoftmaxKernel); - } else { - kernel_.reset(new opencl::buffer::SoftmaxKernel); - } -} -template -MaceStatus SoftmaxFunctor::operator()(const Tensor *logits, - Tensor *output, - StatsFuture *future) { - return kernel_->Compute(context_, logits, output, future); -} - -template struct SoftmaxFunctor; -template struct SoftmaxFunctor; -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/softmax.h b/mace/kernels/opencl/softmax.h new file mode 100644 index 00000000..308b606e --- /dev/null +++ b/mace/kernels/opencl/softmax.h @@ -0,0 +1,39 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_SOFTMAX_H_ +#define MACE_KERNELS_OPENCL_SOFTMAX_H_ + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLSoftmaxKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *logits, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLSoftmaxKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_SOFTMAX_H_ diff --git a/mace/kernels/opencl/space_to_batch.cc b/mace/kernels/opencl/space_to_batch.cc deleted file mode 100644 index c69db85c..00000000 --- a/mace/kernels/opencl/space_to_batch.cc +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_ -#define MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_ - -#include "mace/kernels/space_to_batch.h" -#include "mace/kernels/opencl/image/space_to_batch.h" - -namespace mace { -namespace kernels { - -template -SpaceToBatchFunctor::SpaceToBatchFunctor( - OpKernelContext *context, - const std::vector &paddings, - const std::vector &block_shape) - : SpaceToBatchFunctorBase(context, paddings, block_shape) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::SpaceToBatchKernel); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus SpaceToBatchFunctor::operator()( - const Tensor *space_tensor, Tensor *batch_tensor, StatsFuture *future) { - std::vector output_shape(4, 0); - CalculateSpaceToBatchOutputShape(space_tensor, DataFormat::NHWC, - output_shape.data()); - return kernel_->Compute(context_, space_tensor, paddings_, block_shape_, - output_shape, batch_tensor, future); -} - -template struct SpaceToBatchFunctor; -template struct SpaceToBatchFunctor; - -} // namespace kernels -} // namespace mace -#endif // MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_ diff --git a/mace/ops/identity.h b/mace/kernels/opencl/space_to_batch.h similarity index 51% rename from mace/ops/identity.h rename to mace/kernels/opencl/space_to_batch.h index be4d75bf..22d308ac 100644 --- a/mace/ops/identity.h +++ b/mace/kernels/opencl/space_to_batch.h @@ -12,36 +12,34 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_OPS_IDENTITY_H_ -#define MACE_OPS_IDENTITY_H_ +#ifndef MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_ +#define MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_ #include -#include "mace/core/operator.h" +#include "mace/core/types.h" +#include "mace/public/mace.h" +#include "mace/utils/utils.h" namespace mace { -namespace ops { -template -class IdentityOp : public Operator { +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLSpaceToBatchKernel { public: - IdentityOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - Tensor *output = this->Output(OUTPUT); - output->ReuseTensorBuffer(*input); - SetFutureDefaultWaitFn(future); - return MACE_SUCCESS; - } - - private: - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); + virtual MaceStatus Compute( + OpContext *context, + const Tensor *space_tensor, + const std::vector &paddings, + const std::vector &block_shape, + const std::vector &output_shape, + Tensor *batch_tensor) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLSpaceToBatchKernel); }; -} // namespace ops +} // namespace kernels } // namespace mace -#endif // MACE_OPS_IDENTITY_H_ +#endif // MACE_KERNELS_OPENCL_SPACE_TO_BATCH_H_ diff --git a/mace/kernels/opencl/space_to_depth.cc b/mace/kernels/opencl/space_to_depth.cc deleted file mode 100644 index 3e14047b..00000000 --- a/mace/kernels/opencl/space_to_depth.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/kernels/space_to_depth.h" -#include "mace/kernels/opencl/image/space_to_depth.h" - -namespace mace { -namespace kernels { - -template -SpaceToDepthOpFunctor::SpaceToDepthOpFunctor( - OpKernelContext *context, - const int block_size) - : OpKernel(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::SpaceToDepthKernel(block_size)); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus SpaceToDepthOpFunctor::operator()( - const Tensor *input, Tensor *output, StatsFuture *future) { - return kernel_->Compute(context_, input, output, future); -} - -template struct SpaceToDepthOpFunctor; -template struct SpaceToDepthOpFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/space_to_depth.h b/mace/kernels/opencl/space_to_depth.h new file mode 100644 index 00000000..ea6b16c1 --- /dev/null +++ b/mace/kernels/opencl/space_to_depth.h @@ -0,0 +1,39 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_KERNELS_OPENCL_SPACE_TO_DEPTH_H_ +#define MACE_KERNELS_OPENCL_SPACE_TO_DEPTH_H_ + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLSpaceToDepthKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLSpaceToDepthKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_SPACE_TO_DEPTH_H_ diff --git a/mace/kernels/opencl/split.cc b/mace/kernels/opencl/split.cc deleted file mode 100644 index 2f2a046e..00000000 --- a/mace/kernels/opencl/split.cc +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/split.h" -#include "mace/kernels/opencl/image/split.h" - -namespace mace { -namespace kernels { - -template -SplitFunctor::SplitFunctor(OpKernelContext *context, - const int32_t axis) - : OpKernel(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::SplitKernel(axis)); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus SplitFunctor::operator()( - const Tensor *input, - const std::vector &output_list, - StatsFuture *future) { - return kernel_->Compute(context_, input, output_list, future); -} - -template struct SplitFunctor; -template struct SplitFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/split.h b/mace/kernels/opencl/split.h new file mode 100644 index 00000000..c5cacd6f --- /dev/null +++ b/mace/kernels/opencl/split.h @@ -0,0 +1,41 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_SPLIT_H_ +#define MACE_KERNELS_OPENCL_SPLIT_H_ + +#include + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" + +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLSplitKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + const std::vector &output_list) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLSplitKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_SPLIT_H_ diff --git a/mace/kernels/opencl/sqrdiff_mean.cc b/mace/kernels/opencl/sqrdiff_mean.cc deleted file mode 100644 index a0a6401d..00000000 --- a/mace/kernels/opencl/sqrdiff_mean.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/sqrdiff_mean.h" -#include "mace/kernels/opencl/image/sqrdiff_mean.h" - -namespace mace { -namespace kernels { - -template -SqrDiffMeanFunctor::SqrDiffMeanFunctor( - OpKernelContext *context) : OpKernel(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::SqrDiffMeanKernel()); - } else { - MACE_NOT_IMPLEMENTED; - } -} - -template -MaceStatus SqrDiffMeanFunctor::operator()( - const Tensor *input0, - const Tensor *input1, - Tensor *output, - StatsFuture *future) { - return kernel_->Compute(context_, input0, input1, output, future); -} - -template struct SqrDiffMeanFunctor; -template struct SqrDiffMeanFunctor; -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/sqrdiff_mean.h b/mace/kernels/opencl/sqrdiff_mean.h new file mode 100644 index 00000000..c2d5d197 --- /dev/null +++ b/mace/kernels/opencl/sqrdiff_mean.h @@ -0,0 +1,39 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef MACE_KERNELS_OPENCL_SQRDIFF_MEAN_H_ +#define MACE_KERNELS_OPENCL_SQRDIFF_MEAN_H_ + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { +class OpenCLSqrDiffMeanKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input0, + const Tensor *input1, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLSqrDiffMeanKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_SQRDIFF_MEAN_H_ diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc deleted file mode 100644 index f64945a4..00000000 --- a/mace/kernels/opencl/winograd_transform.cc +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/kernels/winograd_transform.h" -#include "mace/kernels/opencl/image/winograd_transform.h" - -namespace mace { -namespace kernels { - -template -WinogradTransformFunctor::WinogradTransformFunctor( - OpKernelContext *context, - const Padding &padding_type, - const std::vector &paddings, - const int block_size) : OpKernel(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::WinogradTransformKernel( - padding_type, paddings, block_size)); - } else { - MACE_NOT_IMPLEMENTED; - } -} -template -MaceStatus WinogradTransformFunctor::operator()( - const Tensor *input_tensor, Tensor *output_tensor, StatsFuture *future) { - return kernel_->Compute(context_, input_tensor, output_tensor, future); -} - -template -WinogradInverseTransformFunctor::WinogradInverseTransformFunctor( // NOLINT(whitespace/line_length) - OpKernelContext *context, - const ActivationType activation, - const float relux_max_limit, - const int block_size) : OpKernel(context) { - if (context->device()->opencl_runtime()->UseImageMemory()) { - kernel_.reset(new opencl::image::WinogradInverseTransformKernel( - activation, relux_max_limit, block_size)); - } else { - MACE_NOT_IMPLEMENTED; - } -} -template -MaceStatus WinogradInverseTransformFunctor::operator()( - const std::vector &inputs, - Tensor *output_tensor, - StatsFuture *future) { - return kernel_->Compute(context_, inputs, output_tensor, future); -} - -template struct WinogradTransformFunctor; -template struct WinogradTransformFunctor; - -template struct WinogradInverseTransformFunctor; -template struct WinogradInverseTransformFunctor; - -} // namespace kernels -} // namespace mace diff --git a/mace/kernels/opencl/winograd_transform.h b/mace/kernels/opencl/winograd_transform.h new file mode 100644 index 00000000..d706e89b --- /dev/null +++ b/mace/kernels/opencl/winograd_transform.h @@ -0,0 +1,50 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MACE_KERNELS_OPENCL_WINOGRAD_TRANSFORM_H_ +#define MACE_KERNELS_OPENCL_WINOGRAD_TRANSFORM_H_ + +#include + +#include "mace/public/mace.h" +#include "mace/utils/utils.h" +namespace mace { + +class OpContext; +class Tensor; + +namespace kernels { + +class OpenCLWinogradTransformKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const Tensor *input, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLWinogradTransformKernel); +}; + +class OpenCLWinogradInverseTransformKernel { + public: + virtual MaceStatus Compute( + OpContext *context, + const std::vector &inputs, + Tensor *output) = 0; + MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLWinogradInverseTransformKernel); +}; + +} // namespace kernels +} // namespace mace + +#endif // MACE_KERNELS_OPENCL_WINOGRAD_TRANSFORM_H_ diff --git a/mace/kernels/ops_register.cc b/mace/kernels/ops_register.cc new file mode 100644 index 00000000..4dba8910 --- /dev/null +++ b/mace/kernels/ops_register.cc @@ -0,0 +1,132 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/kernels/ops_register.h" + +namespace mace { + +namespace kernels { +// Keep in lexicographical order +extern void RegisterActivation(OpRegistryBase *op_registry); +extern void RegisterAddN(OpRegistryBase *op_registry); +extern void RegisterArgMax(OpRegistryBase *op_registry); +extern void RegisterBatchNorm(OpRegistryBase *op_registry); +extern void RegisterBatchToSpaceND(OpRegistryBase *op_registry); +extern void RegisterBiasAdd(OpRegistryBase *op_registry); +extern void RegisterCast(OpRegistryBase *op_registry); +extern void RegisterChannelShuffle(OpRegistryBase *op_registry); +extern void RegisterConcat(OpRegistryBase *op_registry); +extern void RegisterConv2D(OpRegistryBase *op_registry); +extern void RegisterCrop(OpRegistryBase *op_registry); +extern void RegisterDeconv2D(OpRegistryBase *op_registry); +extern void RegisterDepthToSpace(OpRegistryBase *op_registry); +extern void RegisterDepthwiseConv2d(OpRegistryBase *op_registry); +extern void RegisterDequantize(OpRegistryBase *op_registry); +extern void RegisterEltwise(OpRegistryBase *op_registry); +extern void RegisterExpandDims(OpRegistryBase *op_registry); +extern void RegisterFill(OpRegistryBase *op_registry); +extern void RegisterFullyConnected(OpRegistryBase *op_registry); +extern void RegisterGather(OpRegistryBase *op_registry); +extern void RegisterIdentity(OpRegistryBase *op_registry); +extern void RegisterInferConv2dShape(OpRegistryBase *op_registry); +extern void RegisterLocalResponseNorm(OpRegistryBase *op_registry); +extern void RegisterMatMul(OpRegistryBase 
*op_registry); +extern void RegisterPad(OpRegistryBase *op_registry); +extern void RegisterPooling(OpRegistryBase *op_registry); +extern void RegisterQuantize(OpRegistryBase *op_registry); +extern void RegisterReduceMean(OpRegistryBase *op_registry); +extern void RegisterReshape(OpRegistryBase *op_registry); +extern void RegisterResizeBicubic(OpRegistryBase *op_registry); +extern void RegisterResizeBilinear(OpRegistryBase *op_registry); +extern void RegisterReverse(OpRegistryBase *op_registry); +extern void RegisterScalarMath(OpRegistryBase *op_registry); +extern void RegisterShape(OpRegistryBase *op_registry); +extern void RegisterSoftmax(OpRegistryBase *op_registry); +extern void RegisterSpaceToBatchND(OpRegistryBase *op_registry); +extern void RegisterSpaceToDepth(OpRegistryBase *op_registry); +extern void RegisterSplit(OpRegistryBase *op_registry); +extern void RegisterSqrDiffMean(OpRegistryBase *op_registry); +extern void RegisterSqueeze(OpRegistryBase *op_registry); +extern void RegisterStack(OpRegistryBase *op_registry); +extern void RegisterStridedSlice(OpRegistryBase *op_registry); +extern void RegisterTranspose(OpRegistryBase *op_registry); +extern void RegisterUnstack(OpRegistryBase *op_registry); +#ifdef MACE_ENABLE_OPENCL +extern void RegisterBufferTransform(OpRegistryBase *op_registry); +extern void RegisterBufferInverseTransform(OpRegistryBase *op_registry); +extern void RegisterLSTMCell(OpRegistryBase *op_registry); +extern void RegisterWinogradInverseTransform(OpRegistryBase *op_registry); +extern void RegisterWinogradTransform(OpRegistryBase *op_registry); + +#endif // MACE_ENABLE_OPENCL +} // namespace kernels + + +OpRegistry::OpRegistry() : OpRegistryBase() { + // Keep in lexicographical order + kernels::RegisterActivation(this); + kernels::RegisterAddN(this); + kernels::RegisterArgMax(this); + kernels::RegisterBatchNorm(this); + kernels::RegisterBatchToSpaceND(this); + kernels::RegisterBiasAdd(this); + kernels::RegisterCast(this); + 
kernels::RegisterChannelShuffle(this); + kernels::RegisterConcat(this); + kernels::RegisterConv2D(this); + kernels::RegisterCrop(this); + kernels::RegisterDeconv2D(this); + kernels::RegisterDepthToSpace(this); + kernels::RegisterDepthwiseConv2d(this); + kernels::RegisterDequantize(this); + kernels::RegisterEltwise(this); + kernels::RegisterExpandDims(this); + kernels::RegisterFill(this); + kernels::RegisterFullyConnected(this); + kernels::RegisterGather(this); + kernels::RegisterIdentity(this); + kernels::RegisterInferConv2dShape(this); + kernels::RegisterLocalResponseNorm(this); + kernels::RegisterMatMul(this); + kernels::RegisterPad(this); + kernels::RegisterPooling(this); + kernels::RegisterQuantize(this); + kernels::RegisterReduceMean(this); + kernels::RegisterReshape(this); + kernels::RegisterResizeBicubic(this); + kernels::RegisterResizeBilinear(this); + kernels::RegisterReverse(this); + kernels::RegisterScalarMath(this); + kernels::RegisterShape(this); + kernels::RegisterSoftmax(this); + kernels::RegisterSpaceToBatchND(this); + kernels::RegisterSpaceToDepth(this); + kernels::RegisterSplit(this); + kernels::RegisterSqrDiffMean(this); + kernels::RegisterSqueeze(this); + kernels::RegisterStack(this); + kernels::RegisterStridedSlice(this); + kernels::RegisterTranspose(this); + kernels::RegisterUnstack(this); +#ifdef MACE_ENABLE_OPENCL + kernels::RegisterBufferTransform(this); + kernels::RegisterBufferInverseTransform(this); + kernels::RegisterLSTMCell(this); + kernels::RegisterWinogradInverseTransform(this); + kernels::RegisterWinogradTransform(this); + +#endif // MACE_ENABLE_OPENCL +} + +} // namespace mace diff --git a/mace/ops/ops_register.h b/mace/kernels/ops_register.h similarity index 76% rename from mace/ops/ops_register.h rename to mace/kernels/ops_register.h index 9369fde5..e3576adb 100644 --- a/mace/ops/ops_register.h +++ b/mace/kernels/ops_register.h @@ -12,19 +12,19 @@ // See the License for the specific language governing permissions and //
limitations under the License. -#ifndef MACE_OPS_OPS_REGISTER_H_ -#define MACE_OPS_OPS_REGISTER_H_ +#ifndef MACE_KERNELS_OPS_REGISTER_H_ +#define MACE_KERNELS_OPS_REGISTER_H_ #include "mace/core/operator.h" namespace mace { -class OperatorRegistry : public OperatorRegistryBase { +class OpRegistry : public OpRegistryBase { public: - OperatorRegistry(); - ~OperatorRegistry() = default; + OpRegistry(); + ~OpRegistry() = default; }; } // namespace mace -#endif // MACE_OPS_OPS_REGISTER_H_ +#endif // MACE_KERNELS_OPS_REGISTER_H_ diff --git a/mace/kernels/pad.cc b/mace/kernels/pad.cc new file mode 100644 index 00000000..9024eb0f --- /dev/null +++ b/mace/kernels/pad.cc @@ -0,0 +1,130 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include + +#include "mace/core/operator.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/pad.h" +#endif // MACE_ENABLE_OPENCL + +namespace mace { +namespace kernels { + +template +class PadOp; + +template +class PadOp : public Operation { + public: + explicit PadOp(OpConstructContext *context) + : Operation(context), + paddings_(Operation::GetRepeatedArgs("paddings")), + constant_value_(Operation::GetOptionalArg( + "constant_value", 0.0)) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + MACE_CHECK( + this->paddings_.size() == static_cast(input->dim_size()) * 2); + auto input_shape = input->shape(); + MACE_RETURN_IF_ERROR(output->Resize({input_shape[0] + this->paddings_[0] + + this->paddings_[1], + input_shape[1] + this->paddings_[2] + + this->paddings_[3], + input_shape[2] + this->paddings_[4] + + this->paddings_[5], + input_shape[3] + this->paddings_[6] + + this->paddings_[7]})); + + Tensor::MappingGuard input_guard(input); + Tensor::MappingGuard output_guard(output); + auto input_ptr = input->data(); + T *output_ptr = output->mutable_data(); + std::fill(output_ptr, output_ptr + output->size(), this->constant_value_); + + const index_t batch = input->dim(0); + const index_t channel = input->dim(1); + const index_t height = input->dim(2); + const index_t width = input->dim(3); +#pragma omp parallel for collapse(3) + for (index_t b = 0; b < batch; ++b) { + for (index_t c = 0; c < channel; ++c) { + for (index_t h = 0; h < height; ++h) { + const index_t in_offset = (((b * channel + c) * height) + h) * width; + const index_t out_offset = (((b + this->paddings_[0]) * output->dim(1) + + (c + this->paddings_[2])) * output->dim(2) + + (h + this->paddings_[4])) * output->dim(3) + + this->paddings_[6]; + memcpy(output_ptr + out_offset, + input_ptr + in_offset, + width * sizeof(T)); + } + } + } + + return MaceStatus::MACE_SUCCESS; + } + + 
private: + std::vector paddings_; + float constant_value_; +}; + +#ifdef MACE_ENABLE_OPENCL +template +class PadOp : public Operation { + public: + explicit PadOp(OpConstructContext *context) + : Operation(context) { + std::vector paddings = Operation::GetRepeatedArgs("paddings"); + float constant_value = Operation::GetOptionalArg( + "constant_value", 0.0); + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::PadKernel(paddings, constant_value)); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + + return kernel_->Compute(context, input, output); + } + + private: + std::unique_ptr kernel_; +}; +#endif // MACE_ENABLE_OPENCL + + +void RegisterPad(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Pad", PadOp, + DeviceType::CPU, float); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "Pad", PadOp, + DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "Pad", PadOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/pad.h b/mace/kernels/pad.h deleted file mode 100644 index 23d60bf4..00000000 --- a/mace/kernels/pad.h +++ /dev/null @@ -1,114 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_KERNELS_PAD_H_ -#define MACE_KERNELS_PAD_H_ - -#include -#include -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" - -namespace mace { -namespace kernels { - -template -struct PadFunctor : OpKernel { - PadFunctor(OpKernelContext *context, - const std::vector &paddings, - const float constant_value) - : OpKernel(context), - paddings_(paddings), - constant_value_(constant_value) {} - - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); - MACE_CHECK( - this->paddings_.size() == static_cast(input->dim_size()) * 2); - auto input_shape = input->shape(); - MACE_RETURN_IF_ERROR(output->Resize({input_shape[0] + this->paddings_[0] - + this->paddings_[1], - input_shape[1] + this->paddings_[2] - + this->paddings_[3], - input_shape[2] + this->paddings_[4] - + this->paddings_[5], - input_shape[3] + this->paddings_[6] - + this->paddings_[7]})); - - Tensor::MappingGuard input_guard(input); - Tensor::MappingGuard output_guard(output); - auto input_ptr = input->data(); - T *output_ptr = output->mutable_data(); - std::fill(output_ptr, output_ptr + output->size(), this->constant_value_); - - const index_t batch = input->dim(0); - const index_t channel = input->dim(1); - const index_t height = input->dim(2); - const index_t width = input->dim(3); -#pragma omp parallel for collapse(3) - for (index_t b = 0; b < batch; ++b) { - for (index_t c = 0; c < channel; ++c) { - for (index_t h = 0; h < height; ++h) { - const index_t in_offset = (((b * channel + c) * height) + h) * width; - const index_t out_offset = (((b + this->paddings_[0]) * output->dim(1) - + (c + this->paddings_[2])) * output->dim(2) - + (h + this->paddings_[4])) * output->dim(3) - + this->paddings_[6]; - memcpy(output_ptr + out_offset, - input_ptr + in_offset, - width * sizeof(T)); - } - } - } - - return MACE_SUCCESS; - } - - std::vector paddings_; - float constant_value_; -}; - -#ifdef 
MACE_ENABLE_OPENCL -class OpenCLPadKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLPadKernel); -}; -template -struct PadFunctor : OpKernel { - PadFunctor(OpKernelContext *context, - const std::vector &paddings, - const float constant_value); - - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future); - - std::unique_ptr kernel_; -}; -#endif // MACE_ENABLE_OPENCL - -} // namespace kernels -} // namespace mace - -#endif // MACE_KERNELS_PAD_H_ diff --git a/mace/kernels/pooling.cc b/mace/kernels/pooling.cc new file mode 100644 index 00000000..07d41d11 --- /dev/null +++ b/mace/kernels/pooling.cc @@ -0,0 +1,467 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#if defined(MACE_ENABLE_NEON) +#include +#endif + +#include "mace/kernels/pooling.h" + +#include +#include +#include +#include + +#include "mace/core/future.h" +#include "mace/core/operator.h" +#include "mace/core/tensor.h" +#include "mace/kernels/conv_pool_2d_base.h" +#include "mace/kernels/conv_pool_2d_util.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/pooling.h" +#include "mace/kernels/opencl/buffer/pooling.h" +#endif // MACE_ENABLE_OPENCL + +namespace mace { +namespace kernels { + +class PoolingOpBase : public ConvPool2dOpBase { + public: + explicit PoolingOpBase(OpConstructContext *context) + : ConvPool2dOpBase(context), + kernels_(Operation::GetRepeatedArgs("kernels")), + pooling_type_( + static_cast(Operation::GetOptionalArg( + "pooling_type", static_cast(AVG)))) {} + + protected: + std::vector kernels_; + PoolingType pooling_type_; + + MACE_OP_INPUT_TAGS(INPUT); + MACE_OP_OUTPUT_TAGS(OUTPUT); +}; + +template +class PoolingOp; + +template <> +class PoolingOp : public PoolingOpBase { + public: + explicit PoolingOp(OpConstructContext *context) + : PoolingOpBase(context) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input_tensor = this->Input(0); + Tensor *output_tensor = this->Output(0); + std::vector output_shape(4); + std::vector filter_shape = { + input_tensor->dim(1), input_tensor->dim(1), kernels_[0], kernels_[1]}; + + std::vector paddings(2); + if (paddings_.empty()) { + kernels::CalcNCHWPaddingAndOutputSize( + input_tensor->shape().data(), filter_shape.data(), dilations_.data(), + strides_.data(), padding_type_, output_shape.data(), paddings.data()); + } else { + paddings = paddings_; + CalcNCHWOutputSize(input_tensor->shape().data(), + filter_shape.data(), + paddings_.data(), + dilations_.data(), + strides_.data(), + RoundType::CEIL, + output_shape.data()); + } + MACE_RETURN_IF_ERROR(output_tensor->Resize(output_shape)); + + Tensor::MappingGuard input_guard(input_tensor); + 
Tensor::MappingGuard output_guard(output_tensor); + const float *input = input_tensor->data(); + float *output = output_tensor->mutable_data(); + const index_t *input_shape = input_tensor->shape().data(); + int pad_hw[2] = {paddings[0] / 2, paddings[1] / 2}; + + if (pooling_type_ == PoolingType::MAX) { + MaxPooling(input, + input_shape, + output_shape.data(), + kernels_.data(), + strides_.data(), + dilations_.data(), + pad_hw, + output); + } else if (pooling_type_ == PoolingType::AVG) { + AvgPooling(input, + input_shape, + output_shape.data(), + kernels_.data(), + strides_.data(), + dilations_.data(), + pad_hw, + output); + } else { + MACE_NOT_IMPLEMENTED; + } + + return MaceStatus::MACE_SUCCESS; + } + + private: + void MaxPooling(const float *input, + const index_t *in_shape, + const index_t *out_shape, + const int *filter_hw, + const int *stride_hw, + const int *dilation_hw, + const int *pad_hw, + float *output) { + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; + +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t c = 0; c < out_shape[1]; ++c) { + const index_t out_base = b * out_batch_size + c * out_image_size; + const index_t in_base = b * in_batch_size + c * in_image_size; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w < out_width; ++w) { + const index_t out_offset = out_base + h * out_width + w; + float res = std::numeric_limits::lowest(); + for (int fh = 0; fh < filter_hw[0]; ++fh) { + for (int fw = 0; fw < filter_hw[1]; ++fw) { + index_t inh = + h * stride_hw[0] + dilation_hw[0] * fh - pad_hw[0]; + index_t inw = + w * 
stride_hw[1] + dilation_hw[1] * fw - pad_hw[1]; + if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) { + index_t input_offset = in_base + inh * in_width + inw; + res = std::max(res, input[input_offset]); + } + } + } + output[out_offset] = res; + } + } + } + } + } + + void AvgPooling(const float *input, + const index_t *in_shape, + const index_t *out_shape, + const int *filter_hw, + const int *stride_hw, + const int *dilation_hw, + const int *pad_hw, + float *output) { + const index_t in_image_size = in_shape[2] * in_shape[3]; + const index_t out_image_size = out_shape[2] * out_shape[3]; + const index_t in_batch_size = in_shape[1] * in_image_size; + const index_t out_batch_size = out_shape[1] * out_image_size; + +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t c = 0; c < out_shape[1]; ++c) { + const index_t out_base = b * out_batch_size + c * out_image_size; + const index_t in_base = b * in_batch_size + c * in_image_size; + const index_t in_height = in_shape[2]; + const index_t in_width = in_shape[3]; + const index_t out_height = out_shape[2]; + const index_t out_width = out_shape[3]; + for (index_t h = 0; h < out_height; ++h) { + for (index_t w = 0; w < out_width; ++w) { + const index_t out_offset = out_base + h * out_width + w; + float res = 0; + int block_size = 0; + for (int fh = 0; fh < filter_hw[0]; ++fh) { + for (int fw = 0; fw < filter_hw[1]; ++fw) { + index_t inh = + h * stride_hw[0] + dilation_hw[0] * fh - pad_hw[0]; + index_t inw = + w * stride_hw[1] + dilation_hw[1] * fw - pad_hw[1]; + if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) { + index_t input_offset = in_base + inh * in_width + inw; + res += input[input_offset]; + ++block_size; + } + } + } + output[out_offset] = res / block_size; + } + } + } + } + } +}; + +template <> +class PoolingOp : public PoolingOpBase { + public: + explicit PoolingOp(OpConstructContext *context) + : PoolingOpBase(context) {} + + MaceStatus 
Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input_tensor = this->Input(0); + Tensor *output_tensor = this->Output(0); + MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1, + "Quantized pooling does not support dilation > 1 yet."); + // Use the same scale and zero point with input and output. + output_tensor->SetScale(input_tensor->scale()); + output_tensor->SetZeroPoint(input_tensor->zero_point()); + + std::vector output_shape(4); + std::vector filter_shape = { + input_tensor->dim(3), kernels_[0], kernels_[1], input_tensor->dim(3)}; + + std::vector paddings(2); + if (paddings_.empty()) { + CalcPaddingAndOutputSize(input_tensor->shape().data(), + NHWC, + filter_shape.data(), + OHWI, + dilations_.data(), + strides_.data(), + padding_type_, + output_shape.data(), + paddings.data()); + } else { + paddings = paddings_; + CalcOutputSize(input_tensor->shape().data(), + NHWC, + filter_shape.data(), + OHWI, + paddings_.data(), + dilations_.data(), + strides_.data(), + RoundType::CEIL, + output_shape.data()); + } + MACE_RETURN_IF_ERROR(output_tensor->Resize(output_shape)); + + const index_t out_channels = output_tensor->dim(3); + const index_t in_channels = input_tensor->dim(3); + MACE_CHECK(out_channels == in_channels); + + Tensor::MappingGuard input_guard(input_tensor); + Tensor::MappingGuard output_guard(output_tensor); + const uint8_t *input = input_tensor->data(); + uint8_t *output = output_tensor->mutable_data(); + int pad_hw[2] = {paddings[0] / 2, paddings[1] / 2}; + + if (pooling_type_ == PoolingType::MAX) { + MaxPooling(input, + input_tensor->shape().data(), + output_shape.data(), + kernels_.data(), + strides_.data(), + pad_hw, + output); + } else if (pooling_type_ == PoolingType::AVG) { + AvgPooling(input, + input_tensor->shape().data(), + output_shape.data(), + kernels_.data(), + strides_.data(), + pad_hw, + output); + } else { + MACE_NOT_IMPLEMENTED; + } + + return MaceStatus::MACE_SUCCESS; + } + + private: + void MaxPooling(const 
uint8_t *input, + const index_t *in_shape, + const index_t *out_shape, + const int *filter_hw, + const int *stride_hw, + const int *pad_hw, + uint8_t *output) { +#pragma omp parallel for collapse(3) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t h = 0; h < out_shape[1]; ++h) { + for (index_t w = 0; w < out_shape[2]; ++w) { + const index_t out_height = out_shape[1]; + const index_t out_width = out_shape[2]; + const index_t channels = out_shape[3]; + const index_t in_height = in_shape[1]; + const index_t in_width = in_shape[2]; + const index_t in_h_base = h * stride_hw[0] - pad_hw[0]; + const index_t in_w_base = w * stride_hw[1] - pad_hw[1]; + const index_t in_h_begin = std::max(0, in_h_base); + const index_t in_w_begin = std::max(0, in_w_base); + const index_t in_h_end = + std::min(in_height, in_h_base + filter_hw[0]); + const index_t in_w_end = + std::min(in_width, in_w_base + filter_hw[1]); + + uint8_t *out_ptr = + output + ((b * out_height + h) * out_width + w) * channels; + std::fill_n(out_ptr, channels, 0); + for (index_t ih = in_h_begin; ih < in_h_end; ++ih) { + for (index_t iw = in_w_begin; iw < in_w_end; ++iw) { + const uint8_t *in_ptr = input + + ((b * in_height + ih) * in_width + iw) * channels; + index_t c = 0; +#if defined(MACE_ENABLE_NEON) + for (; c <= channels - 16; c += 16) { + uint8x16_t out_vec = vld1q_u8(out_ptr + c); + uint8x16_t in_vec = vld1q_u8(in_ptr + c); + out_vec = vmaxq_u8(out_vec, in_vec); + vst1q_u8(out_ptr + c, out_vec); + } + for (; c <= channels - 8; c += 8) { + uint8x8_t out_vec = vld1_u8(out_ptr + c); + uint8x8_t in_vec = vld1_u8(in_ptr + c); + out_vec = vmax_u8(out_vec, in_vec); + vst1_u8(out_ptr + c, out_vec); + } +#endif + for (; c < channels; ++c) { + out_ptr[c] = std::max(out_ptr[c], in_ptr[c]); + } + } + } + } + } + } + } + + void AvgPooling(const uint8_t *input, + const index_t *in_shape, + const index_t *out_shape, + const int *filter_hw, + const int *stride_hw, + const int *pad_hw, + uint8_t *output) { 
+#pragma omp parallel for collapse(3) + for (index_t b = 0; b < out_shape[0]; ++b) { + for (index_t h = 0; h < out_shape[1]; ++h) { + for (index_t w = 0; w < out_shape[2]; ++w) { + const index_t out_height = out_shape[1]; + const index_t out_width = out_shape[2]; + const index_t channels = out_shape[3]; + const index_t in_height = in_shape[1]; + const index_t in_width = in_shape[2]; + const index_t in_h_base = h * stride_hw[0] - pad_hw[0]; + const index_t in_w_base = w * stride_hw[1] - pad_hw[1]; + const index_t in_h_begin = std::max(0, in_h_base); + const index_t in_w_begin = std::max(0, in_w_base); + const index_t in_h_end = + std::min(in_height, in_h_base + filter_hw[0]); + const index_t in_w_end = + std::min(in_width, in_w_base + filter_hw[1]); + const index_t block_size = + (in_h_end - in_h_begin) * (in_w_end - in_w_begin); + MACE_CHECK(block_size > 0); + + std::vector average_buffer(channels); + uint16_t *avg_buffer = average_buffer.data(); + std::fill_n(avg_buffer, channels, 0); + for (index_t ih = in_h_begin; ih < in_h_end; ++ih) { + for (index_t iw = in_w_begin; iw < in_w_end; ++iw) { + const uint8_t *in_ptr = input + + ((b * in_height + ih) * in_width + iw) * channels; + index_t c = 0; +#if defined(MACE_ENABLE_NEON) + for (; c <= channels - 16; c += 16) { + uint16x8_t avg_vec[2]; + avg_vec[0] = vld1q_u16(avg_buffer + c); + avg_vec[1] = vld1q_u16(avg_buffer + c + 8); + uint8x16_t in_vec = vld1q_u8(in_ptr + c); + avg_vec[0] = vaddw_u8(avg_vec[0], vget_low_u8(in_vec)); + avg_vec[1] = vaddw_u8(avg_vec[1], vget_high_u8(in_vec)); + vst1q_u16(avg_buffer + c, avg_vec[0]); + vst1q_u16(avg_buffer + c + 8, avg_vec[1]); + } + for (; c <= channels - 8; c += 8) { + uint16x8_t avg_vec = vld1q_u16(avg_buffer + c); + uint8x8_t in_vec = vld1_u8(in_ptr + c); + avg_vec = vaddw_u8(avg_vec, in_vec); + vst1q_u16(avg_buffer + c, avg_vec); + } +#endif + for (; c < channels; ++c) { + avg_buffer[c] += in_ptr[c]; + } + } + } + uint8_t *out_ptr = + output + ((b * out_height + h) * 
out_width + w) * channels; + for (index_t c = 0; c < channels; ++c) { + out_ptr[c] = static_cast( + (avg_buffer[c] + block_size / 2) / block_size); + } + } + } + } + } +}; + +#ifdef MACE_ENABLE_OPENCL +template +class PoolingOp : public PoolingOpBase { + public: + explicit PoolingOp(OpConstructContext *context) + : PoolingOpBase(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::PoolingKernel); + } else { + kernel_.reset(new opencl::buffer::PoolingKernel); + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + + return kernel_->Compute(context, input, pooling_type_, kernels_.data(), + strides_.data(), padding_type_, paddings_, + dilations_.data(), output); + } + + private: + std::unique_ptr kernel_; +}; +#endif // MACE_ENABLE_OPENCL + + +void RegisterPooling(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp, + DeviceType::CPU, float); + MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp, + DeviceType::CPU, uint8_t); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp, + DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "Pooling", PoolingOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h index 000fa269..9780907c 100644 --- a/mace/kernels/pooling.h +++ b/mace/kernels/pooling.h @@ -15,476 +15,12 @@ #ifndef MACE_KERNELS_POOLING_H_ #define MACE_KERNELS_POOLING_H_ -#include -#include -#include -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/conv_pool_2d_util.h" -#include "mace/kernels/kernel.h" - -#if defined(MACE_ENABLE_NEON) -#include -#endif namespace mace { - enum PoolingType { AVG = 1, // avg_pool MAX = 2, // max_pool }; - -namespace kernels { - -struct PoolingFunctorBase : OpKernel { - 
PoolingFunctorBase(OpKernelContext *context, - const PoolingType pooling_type, - const int *kernels, - const int *strides, - const Padding padding_type, - const std::vector &paddings, - const int *dilations) - : OpKernel(context), - pooling_type_(pooling_type), - kernels_(kernels), - strides_(strides), - padding_type_(padding_type), - paddings_(paddings), - dilations_(dilations) {} - - const PoolingType pooling_type_; - const int *kernels_; - const int *strides_; - const Padding padding_type_; - std::vector paddings_; - const int *dilations_; -}; - -template -struct PoolingFunctor; - -template <> -struct PoolingFunctor: PoolingFunctorBase { - PoolingFunctor(OpKernelContext *context, - const PoolingType pooling_type, - const int *kernels, - const int *strides, - const Padding padding_type, - const std::vector &paddings, - const int *dilations) - : PoolingFunctorBase(context, - pooling_type, - kernels, - strides, - padding_type, - paddings, - dilations) {} - - void MaxPooling(const float *input, - const index_t *in_shape, - const index_t *out_shape, - const int *filter_hw, - const int *stride_hw, - const int *dilation_hw, - const int *pad_hw, - float *output) { - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; - -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t c = 0; c < out_shape[1]; ++c) { - const index_t out_base = b * out_batch_size + c * out_image_size; - const index_t in_base = b * in_batch_size + c * in_image_size; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w < out_width; ++w) { - const index_t out_offset = out_base + h * 
out_width + w; - float res = std::numeric_limits::lowest(); - for (int fh = 0; fh < filter_hw[0]; ++fh) { - for (int fw = 0; fw < filter_hw[1]; ++fw) { - index_t inh = - h * stride_hw[0] + dilation_hw[0] * fh - pad_hw[0]; - index_t inw = - w * stride_hw[1] + dilation_hw[1] * fw - pad_hw[1]; - if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) { - index_t input_offset = in_base + inh * in_width + inw; - res = std::max(res, input[input_offset]); - } - } - } - output[out_offset] = res; - } - } - } - } - } - - void AvgPooling(const float *input, - const index_t *in_shape, - const index_t *out_shape, - const int *filter_hw, - const int *stride_hw, - const int *dilation_hw, - const int *pad_hw, - float *output) { - const index_t in_image_size = in_shape[2] * in_shape[3]; - const index_t out_image_size = out_shape[2] * out_shape[3]; - const index_t in_batch_size = in_shape[1] * in_image_size; - const index_t out_batch_size = out_shape[1] * out_image_size; - -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t c = 0; c < out_shape[1]; ++c) { - const index_t out_base = b * out_batch_size + c * out_image_size; - const index_t in_base = b * in_batch_size + c * in_image_size; - const index_t in_height = in_shape[2]; - const index_t in_width = in_shape[3]; - const index_t out_height = out_shape[2]; - const index_t out_width = out_shape[3]; - for (index_t h = 0; h < out_height; ++h) { - for (index_t w = 0; w < out_width; ++w) { - const index_t out_offset = out_base + h * out_width + w; - float res = 0; - int block_size = 0; - for (int fh = 0; fh < filter_hw[0]; ++fh) { - for (int fw = 0; fw < filter_hw[1]; ++fw) { - index_t inh = - h * stride_hw[0] + dilation_hw[0] * fh - pad_hw[0]; - index_t inw = - w * stride_hw[1] + dilation_hw[1] * fw - pad_hw[1]; - if (inh >= 0 && inh < in_height && inw >= 0 && inw < in_width) { - index_t input_offset = in_base + inh * in_width + inw; - res += input[input_offset]; - ++block_size; - 
} - } - } - output[out_offset] = res / block_size; - } - } - } - } - } - - MaceStatus operator()(const Tensor *input_tensor, // NCHW - Tensor *output_tensor, // NCHW - StatsFuture *future) { - MACE_UNUSED(future); - std::vector output_shape(4); - std::vector filter_shape = { - input_tensor->dim(1), input_tensor->dim(1), kernels_[0], kernels_[1]}; - - std::vector paddings(2); - if (paddings_.empty()) { - kernels::CalcNCHWPaddingAndOutputSize( - input_tensor->shape().data(), filter_shape.data(), dilations_, - strides_, padding_type_, output_shape.data(), paddings.data()); - } else { - paddings = paddings_; - CalcNCHWOutputSize(input_tensor->shape().data(), - filter_shape.data(), - paddings_.data(), - dilations_, - strides_, - RoundType::CEIL, - output_shape.data()); - } - MACE_RETURN_IF_ERROR(output_tensor->Resize(output_shape)); - - Tensor::MappingGuard input_guard(input_tensor); - Tensor::MappingGuard output_guard(output_tensor); - const float *input = input_tensor->data(); - float *output = output_tensor->mutable_data(); - const index_t *input_shape = input_tensor->shape().data(); - int pad_hw[2] = {paddings[0] / 2, paddings[1] / 2}; - - if (pooling_type_ == PoolingType::MAX) { - MaxPooling(input, - input_shape, - output_shape.data(), - kernels_, - strides_, - dilations_, - pad_hw, - output); - } else if (pooling_type_ == PoolingType::AVG) { - AvgPooling(input, - input_shape, - output_shape.data(), - kernels_, - strides_, - dilations_, - pad_hw, - output); - } else { - MACE_NOT_IMPLEMENTED; - } - - return MACE_SUCCESS; - } -}; - -template <> -struct PoolingFunctor: PoolingFunctorBase { - PoolingFunctor(OpKernelContext *context, - const PoolingType pooling_type, - const int *kernels, - const int *strides, - const Padding padding_type, - const std::vector &paddings, - const int *dilations) - : PoolingFunctorBase(context, - pooling_type, - kernels, - strides, - padding_type, - paddings, - dilations) {} - - void MaxPooling(const uint8_t *input, - const index_t 
*in_shape, - const index_t *out_shape, - const int *filter_hw, - const int *stride_hw, - const int *pad_hw, - uint8_t *output) { -#pragma omp parallel for collapse(3) - for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t h = 0; h < out_shape[1]; ++h) { - for (index_t w = 0; w < out_shape[2]; ++w) { - const index_t out_height = out_shape[1]; - const index_t out_width = out_shape[2]; - const index_t channels = out_shape[3]; - const index_t in_height = in_shape[1]; - const index_t in_width = in_shape[2]; - const index_t in_h_base = h * stride_hw[0] - pad_hw[0]; - const index_t in_w_base = w * stride_hw[1] - pad_hw[1]; - const index_t in_h_begin = std::max(0, in_h_base); - const index_t in_w_begin = std::max(0, in_w_base); - const index_t in_h_end = - std::min(in_height, in_h_base + filter_hw[0]); - const index_t in_w_end = - std::min(in_width, in_w_base + filter_hw[1]); - - uint8_t *out_ptr = - output + ((b * out_height + h) * out_width + w) * channels; - std::fill_n(out_ptr, channels, 0); - for (index_t ih = in_h_begin; ih < in_h_end; ++ih) { - for (index_t iw = in_w_begin; iw < in_w_end; ++iw) { - const uint8_t *in_ptr = input + - ((b * in_height + ih) * in_width + iw) * channels; - index_t c = 0; -#if defined(MACE_ENABLE_NEON) - for (; c <= channels - 16; c += 16) { - uint8x16_t out_vec = vld1q_u8(out_ptr + c); - uint8x16_t in_vec = vld1q_u8(in_ptr + c); - out_vec = vmaxq_u8(out_vec, in_vec); - vst1q_u8(out_ptr + c, out_vec); - } - for (; c <= channels - 8; c += 8) { - uint8x8_t out_vec = vld1_u8(out_ptr + c); - uint8x8_t in_vec = vld1_u8(in_ptr + c); - out_vec = vmax_u8(out_vec, in_vec); - vst1_u8(out_ptr + c, out_vec); - } -#endif - for (; c < channels; ++c) { - out_ptr[c] = std::max(out_ptr[c], in_ptr[c]); - } - } - } - } - } - } - } - - void AvgPooling(const uint8_t *input, - const index_t *in_shape, - const index_t *out_shape, - const int *filter_hw, - const int *stride_hw, - const int *pad_hw, - uint8_t *output) { -#pragma omp parallel for collapse(3) 
- for (index_t b = 0; b < out_shape[0]; ++b) { - for (index_t h = 0; h < out_shape[1]; ++h) { - for (index_t w = 0; w < out_shape[2]; ++w) { - const index_t out_height = out_shape[1]; - const index_t out_width = out_shape[2]; - const index_t channels = out_shape[3]; - const index_t in_height = in_shape[1]; - const index_t in_width = in_shape[2]; - const index_t in_h_base = h * stride_hw[0] - pad_hw[0]; - const index_t in_w_base = w * stride_hw[1] - pad_hw[1]; - const index_t in_h_begin = std::max(0, in_h_base); - const index_t in_w_begin = std::max(0, in_w_base); - const index_t in_h_end = - std::min(in_height, in_h_base + filter_hw[0]); - const index_t in_w_end = - std::min(in_width, in_w_base + filter_hw[1]); - const index_t block_size = - (in_h_end - in_h_begin) * (in_w_end - in_w_begin); - MACE_CHECK(block_size > 0); - - std::vector average_buffer(channels); - uint16_t *avg_buffer = average_buffer.data(); - std::fill_n(avg_buffer, channels, 0); - for (index_t ih = in_h_begin; ih < in_h_end; ++ih) { - for (index_t iw = in_w_begin; iw < in_w_end; ++iw) { - const uint8_t *in_ptr = input + - ((b * in_height + ih) * in_width + iw) * channels; - index_t c = 0; -#if defined(MACE_ENABLE_NEON) - for (; c <= channels - 16; c += 16) { - uint16x8_t avg_vec[2]; - avg_vec[0] = vld1q_u16(avg_buffer + c); - avg_vec[1] = vld1q_u16(avg_buffer + c + 8); - uint8x16_t in_vec = vld1q_u8(in_ptr + c); - avg_vec[0] = vaddw_u8(avg_vec[0], vget_low_u8(in_vec)); - avg_vec[1] = vaddw_u8(avg_vec[1], vget_high_u8(in_vec)); - vst1q_u16(avg_buffer + c, avg_vec[0]); - vst1q_u16(avg_buffer + c + 8, avg_vec[1]); - } - for (; c <= channels - 8; c += 8) { - uint16x8_t avg_vec = vld1q_u16(avg_buffer + c); - uint8x8_t in_vec = vld1_u8(in_ptr + c); - avg_vec = vaddw_u8(avg_vec, in_vec); - vst1q_u16(avg_buffer + c, avg_vec); - } -#endif - for (; c < channels; ++c) { - avg_buffer[c] += in_ptr[c]; - } - } - } - uint8_t *out_ptr = - output + ((b * out_height + h) * out_width + w) * channels; - for 
(index_t c = 0; c < channels; ++c) { - out_ptr[c] = static_cast( - (avg_buffer[c] + block_size / 2) / block_size); - } - } - } - } - } - - MaceStatus operator()(const Tensor *input_tensor, // NHWC - Tensor *output_tensor, // NHWC - StatsFuture *future) { - MACE_UNUSED(future); - MACE_CHECK(dilations_[0] == 1 && dilations_[1] == 1, - "Quantized pooling does not support dilation > 1 yet."); - // Use the same scale and zero point with input and output. - output_tensor->SetScale(input_tensor->scale()); - output_tensor->SetZeroPoint(input_tensor->zero_point()); - - std::vector output_shape(4); - std::vector filter_shape = { - input_tensor->dim(3), kernels_[0], kernels_[1], input_tensor->dim(3)}; - - std::vector paddings(2); - if (paddings_.empty()) { - CalcPaddingAndOutputSize(input_tensor->shape().data(), - NHWC, - filter_shape.data(), - OHWI, - dilations_, - strides_, - padding_type_, - output_shape.data(), - paddings.data()); - } else { - paddings = paddings_; - CalcOutputSize(input_tensor->shape().data(), - NHWC, - filter_shape.data(), - OHWI, - paddings_.data(), - dilations_, - strides_, - RoundType::CEIL, - output_shape.data()); - } - MACE_RETURN_IF_ERROR(output_tensor->Resize(output_shape)); - - const index_t out_channels = output_tensor->dim(3); - const index_t in_channels = input_tensor->dim(3); - MACE_CHECK(out_channels == in_channels); - - Tensor::MappingGuard input_guard(input_tensor); - Tensor::MappingGuard output_guard(output_tensor); - const uint8_t *input = input_tensor->data(); - uint8_t *output = output_tensor->mutable_data(); - int pad_hw[2] = {paddings[0] / 2, paddings[1] / 2}; - - if (pooling_type_ == PoolingType::MAX) { - MaxPooling(input, - input_tensor->shape().data(), - output_shape.data(), - kernels_, - strides_, - pad_hw, - output); - } else if (pooling_type_ == PoolingType::AVG) { - AvgPooling(input, - input_tensor->shape().data(), - output_shape.data(), - kernels_, - strides_, - pad_hw, - output); - } else { - MACE_NOT_IMPLEMENTED; - } - - 
return MACE_SUCCESS; - } -}; - -#ifdef MACE_ENABLE_OPENCL -class OpenCLPoolingKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - const PoolingType pooling_type, - const int *kernels, - const int *strides, - const Padding &padding_type, - const std::vector &padding_data, - const int *dilations, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLPoolingKernel); -}; -template -struct PoolingFunctor : PoolingFunctorBase { - PoolingFunctor(OpKernelContext *context, - const PoolingType pooling_type, - const int *kernels, - const int *strides, - const Padding padding_type, - const std::vector &paddings, - const int *dilations); - - MaceStatus operator()(const Tensor *input_tensor, - Tensor *output_tensor, - StatsFuture *future); - - std::unique_ptr kernel_; -}; -#endif // MACE_ENABLE_OPENCL - -} // namespace kernels } // namespace mace #endif // MACE_KERNELS_POOLING_H_ diff --git a/mace/kernels/proposal.h b/mace/kernels/proposal.h deleted file mode 100644 index aa002988..00000000 --- a/mace/kernels/proposal.h +++ /dev/null @@ -1,301 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_KERNELS_PROPOSAL_H_ -#define MACE_KERNELS_PROPOSAL_H_ - -#include -#include -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" -#include "mace/public/mace.h" - -namespace mace { -namespace kernels { - -inline std::vector WHCenters(const std::vector &anchor) { - // width, height, width_center, height_center - std::vector window(4); - window[0] = anchor[2] - anchor[0] + 1; - window[1] = anchor[3] - anchor[1] + 1; - window[2] = anchor[0] + (window[0] - 1) / 2; - window[3] = anchor[1] + (window[1] - 1) / 2; - return window; -} - -inline std::vector> GenerateAnchors( - const std::vector &scales, - const std::vector &ratios, - const int base_size) { - const std::vector base_anchor = - {0, 0, - static_cast(base_size-1), - static_cast(base_size-1)}; - - const size_t scales_size = scales.size(); - const size_t ratios_size = ratios.size(); - // get height, width, centers - std::vector base_window = WHCenters(base_anchor); - const float size = base_window[0] * base_window[1]; - std::vector> anchors(scales_size * ratios_size, - std::vector(4)); - -#pragma omp parallel for - for (size_t ratio_idx = 0; ratio_idx < ratios_size; ++ratio_idx) { - float ws = ::roundf(::sqrtf(size / ratios[ratio_idx])); - float hs = ::roundf(ws * ratios[ratio_idx]); - std::vector tmp_anchor(4); - tmp_anchor[0] = base_window[2] - (ws - 1) / 2; - tmp_anchor[1] = base_window[3] - (hs - 1) / 2; - tmp_anchor[2] = base_window[2] + (ws - 1) / 2; - tmp_anchor[3] = base_window[3] + (hs - 1) / 2; - auto window = WHCenters(tmp_anchor); - for (size_t scale_idx = 0; scale_idx < scales_size; ++scale_idx) { - const size_t idx = ratio_idx * scales_size + scale_idx; - ws = window[0] * scales[scale_idx]; - hs = window[1] * scales[scale_idx]; - anchors[idx][0] = window[2] - (ws - 1) / 2; - anchors[idx][1] = window[3] - (hs - 1) / 2; - anchors[idx][2] = window[2] + (ws - 1) / 2; - anchors[idx][3] = window[3] + (hs - 1) / 2; - } - } - return 
anchors; -} - -inline std::vector nms(const float *bboxes_ptr, - const index_t num_bboxes, - const float thresh, - const int post_nms_top_n) { - std::vector keep; - std::vector suppressed(num_bboxes, 0); - - std::vector areas(num_bboxes, 0); - for (index_t i = 0; i < num_bboxes; ++i) { - const index_t idx = (i << 2); - areas[i] = (bboxes_ptr[idx + 2] - bboxes_ptr[idx] + 1) * - (bboxes_ptr[idx + 3] - bboxes_ptr[idx + 1] + 1); - } - - for (int i = 0; i < num_bboxes; ++i) { - if (suppressed[i] == 1) continue; - keep.push_back(i); - if (keep.size() >= static_cast(post_nms_top_n)) break; - int coord_idx = i << 2; - const float x1 = bboxes_ptr[coord_idx]; - const float y1 = bboxes_ptr[coord_idx + 1]; - const float x2 = bboxes_ptr[coord_idx + 2]; - const float y2 = bboxes_ptr[coord_idx + 3]; - const float area1 = areas[i]; - for (int j = i + 1; j < num_bboxes; ++j) { - if (suppressed[j] == 1) continue; - - coord_idx = j << 2; - const float iou = - std::max(0.0, - std::min(x2, bboxes_ptr[coord_idx + 2]) - - std::max(x1, bboxes_ptr[coord_idx]) + 1) - * std::max(0.0, - std::min(y2, bboxes_ptr[coord_idx + 3]) - - std::max(y1, bboxes_ptr[coord_idx + 1]) + 1); - if ((iou / (area1 + areas[j] - iou)) >= thresh) { - suppressed[j] = 1; - } - } - } - return keep; -} - - -template -struct ProposalFunctor : OpKernel { - ProposalFunctor(OpKernelContext *context, - const int min_size, - const float nms_thresh, - const int pre_nms_top_n, - const int post_nms_top_n, - const int feat_stride, - const int base_size, - const std::vector &scales, - const std::vector &ratios) : - OpKernel(context), - min_size_(min_size), - thresh_(nms_thresh), - pre_nms_top_n_(pre_nms_top_n), - post_nms_top_n_(post_nms_top_n), - feat_stride_(feat_stride), - anchors_(GenerateAnchors(scales, ratios, base_size)) {} - - MaceStatus operator()(const Tensor *rpn_cls_prob, - const Tensor *rpn_bbox_pred, - const Tensor *img_info_tensor, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); - 
MACE_CHECK(rpn_cls_prob->dim(1) == rpn_bbox_pred->dim(1) && - rpn_cls_prob->dim(2) == rpn_bbox_pred->dim(2)); - MACE_CHECK((rpn_cls_prob->dim(3) / 2 == rpn_bbox_pred->dim(3) / 4) && - (static_cast(rpn_cls_prob->dim(3) / 2) == anchors_.size())); - const float *img_info = img_info_tensor->data(); - const int im_height = static_cast(img_info[0] - 1); - const int im_width = static_cast(img_info[1] - 1); - const index_t feat_height = rpn_cls_prob->dim(1); - const index_t feat_width = rpn_cls_prob->dim(2); - const int anchors_size = anchors_.size(); - - // shift anchors to original input - std::vector> proposals( - anchors_size * feat_height * feat_width, - std::vector(4)); - -#pragma omp parallel for collapse(3) - for (int h_idx = 0; h_idx < feat_height; ++h_idx) { - for (int w_idx = 0; w_idx < feat_width; ++w_idx) { - for (int a_idx = 0; a_idx < anchors_size; ++a_idx) { - const int shift_h = h_idx * feat_stride_; - const int shift_w = w_idx * feat_stride_; - const index_t sanc_idx = (h_idx * feat_width + w_idx) * anchors_size - + a_idx; - proposals[sanc_idx][0] = anchors_[a_idx][0] + shift_w; - proposals[sanc_idx][1] = anchors_[a_idx][1] + shift_h; - proposals[sanc_idx][2] = anchors_[a_idx][2] + shift_w; - proposals[sanc_idx][3] = anchors_[a_idx][3] + shift_h; - } - } - } - // Convert anchors into proposals via bbox transformations - // 2. 
clip predicted boxes to image - const float *bbox_deltas = rpn_bbox_pred->data(); -#pragma omp parallel for collapse(3) - for (int h_idx = 0; h_idx < feat_height; ++h_idx) { - for (int w_idx = 0; w_idx < feat_width; ++w_idx) { - for (int a_idx = 0; a_idx < anchors_size; ++a_idx) { - const index_t sanc_idx = (h_idx * feat_width + w_idx) * anchors_size - + a_idx; - const float width = proposals[sanc_idx][2] - - proposals[sanc_idx][0] + 1; - const float height = proposals[sanc_idx][3] - - proposals[sanc_idx][1] + 1; - int delta_offset = sanc_idx * 4; - float pred_ctr_x = bbox_deltas[delta_offset + 0] * width + - (proposals[sanc_idx][0] + width / 2); - float pred_ctr_y = bbox_deltas[delta_offset + 1] * height + - (proposals[sanc_idx][1] + height / 2); - float pred_w = std::exp(bbox_deltas[delta_offset + 2]) * width; - float pred_h = std::exp(bbox_deltas[delta_offset + 3]) * height; - - proposals[sanc_idx][0] = std::max( - std::min(pred_ctr_x - pred_w / 2, im_width), - 0); - proposals[sanc_idx][1] = std::max( - std::min(pred_ctr_y - pred_h / 2, im_height), - 0); - proposals[sanc_idx][2] = std::max( - std::min(pred_ctr_x + pred_w / 2, im_width), - 0); - proposals[sanc_idx][3] = std::max( - std::min(pred_ctr_y + pred_h / 2, im_height), - 0); - } - } - } - // 3. remove predicted boxes with either height or width < threshold - // (NOTE: convert min_size to input image scale stored in im_info[2]) - std::vector keep; - const float min_size = min_size_ * img_info[2]; - for (int h_idx = 0; h_idx < feat_height; ++h_idx) { - for (int w_idx = 0; w_idx < feat_width; ++w_idx) { - for (int a_idx = 0; a_idx < anchors_size; ++a_idx) { - const index_t sanc_idx = (h_idx * feat_width + w_idx) * anchors_size - + a_idx; - const float width = proposals[sanc_idx][2] - - proposals[sanc_idx][0] + 1; - const float height = proposals[sanc_idx][3] - - proposals[sanc_idx][1] + 1; - if (width >= min_size && height >= min_size) { - keep.push_back(sanc_idx); - } - } - } - } - - // 4. 
sort all (proposal, score) pairs by score from highest to lowest - // 5. take top pre_nms_topN (e.g. 6000) - auto scores = rpn_cls_prob->data(); - const int scores_chan = static_cast(rpn_cls_prob->dim(3)); - - auto score_idx_func = [&](int idx) -> int { - return (idx / anchors_size) * scores_chan + - (idx % anchors_size) + anchors_size; - }; - std::sort(keep.begin(), keep.end(), [&](int left, int right) -> bool{ - return scores[score_idx_func(left)] > - scores[score_idx_func(right)]; - }); - - int size = std::min(pre_nms_top_n_, keep.size()); - std::vector nms_scores(size, 0); - std::vector nms_proposals((size << 2), 0); -#pragma omp parallel for - for (int i = 0; i < size; ++i) { - nms_scores[i] = scores[score_idx_func(keep[i])]; - nms_proposals[i << 2] = proposals[keep[i]][0]; - nms_proposals[(i << 2) + 1] = proposals[keep[i]][1]; - nms_proposals[(i << 2) + 2] = proposals[keep[i]][2]; - nms_proposals[(i << 2) + 3] = proposals[keep[i]][3]; - } - - /* 6. apply nms (e.g. threshold = 0.7) - 7. take after_nms_topN (e.g. 300) - 8. 
return the top proposals (-> RoIs top) */ - auto nms_result = nms(nms_proposals.data(), - nms_scores.size(), - thresh_, - post_nms_top_n_); - - // Output rois blob - // Our RPN implementation only supports a single input image, so all - // batch inds are 0 - size = static_cast(nms_result.size()); - MACE_RETURN_IF_ERROR(output->Resize({size, 1, 1, 5})); - auto output_ptr = output->mutable_data(); -#pragma omp parallel for - for (int i = 0; i < size; ++i) { - const int out_idx = i * 5; - const int nms_idx = nms_result[i] * 4; - output_ptr[out_idx] = 0; - output_ptr[out_idx + 1] = nms_proposals[nms_idx]; - output_ptr[out_idx + 2] = nms_proposals[nms_idx + 1]; - output_ptr[out_idx + 3] = nms_proposals[nms_idx + 2]; - output_ptr[out_idx + 4] = nms_proposals[nms_idx + 3]; - } - - return MACE_SUCCESS; - } - - const int min_size_; - const float thresh_; - const int pre_nms_top_n_; - const int post_nms_top_n_; - const int feat_stride_; - std::vector> anchors_; -}; - -} // namespace kernels -} // namespace mace - -#endif // MACE_KERNELS_PROPOSAL_H_ diff --git a/mace/kernels/quantize.h b/mace/kernels/quantize.cc similarity index 58% rename from mace/kernels/quantize.h rename to mace/kernels/quantize.cc index 337a8316..2f2b8fc2 100644 --- a/mace/kernels/quantize.h +++ b/mace/kernels/quantize.cc @@ -12,34 +12,34 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_KERNELS_QUANTIZE_H_ -#define MACE_KERNELS_QUANTIZE_H_ - #include #include #include #include -#include "mace/core/future.h" +#include "mace/core/operator.h" #include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" #include "mace/utils/quantize.h" namespace mace { namespace kernels { -template -struct QuantizeFunctor; +template +class QuantizeOp; -template<> -struct QuantizeFunctor : OpKernel { - explicit QuantizeFunctor(OpKernelContext *context) : OpKernel(context) {} +template <> +class QuantizeOp : public Operation { + public: + explicit QuantizeOp(OpConstructContext *context) + : Operation(context), + non_zero_( + static_cast(Operation::GetOptionalArg("non_zero", 0))) {} - MaceStatus operator()(const Tensor *input, - const bool non_zero, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); Tensor::MappingGuard input_guard(input); Tensor::MappingGuard output_guard(output); const float *input_data = input->data(); @@ -55,29 +55,34 @@ struct QuantizeFunctor : OpKernel { int32_t zero_point; Quantize(input_data, input->size(), - non_zero, + non_zero_, output_data, &scale, &zero_point); output->SetScale(scale); output->SetZeroPoint(zero_point); } - - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } + + private: + bool non_zero_; }; -template -struct DequantizeFunctor; +template +class DequantizeOp; -template<> -struct DequantizeFunctor : OpKernel { - explicit DequantizeFunctor(OpKernelContext *context) : OpKernel(context) {} +template <> +class DequantizeOp : public Operation { + public: + explicit DequantizeOp(OpConstructContext *context) + : Operation(context) {} - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); + MaceStatus Run(OpContext *context) override { + 
MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); Tensor::MappingGuard input_guard(input); Tensor::MappingGuard output_guard(output); const uint8_t *input_data = input->data(); @@ -87,12 +92,18 @@ struct DequantizeFunctor : OpKernel { input->scale(), input->zero_point(), output_data); - - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } }; +void RegisterQuantize(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Quantize", QuantizeOp, + DeviceType::CPU, uint8_t); +} + +void RegisterDequantize(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Dequantize", DequantizeOp, + DeviceType::CPU, uint8_t); +} } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_QUANTIZE_H_ diff --git a/mace/kernels/reduce_mean.h b/mace/kernels/reduce_mean.cc similarity index 74% rename from mace/kernels/reduce_mean.h rename to mace/kernels/reduce_mean.cc index db00fd41..d103125b 100644 --- a/mace/kernels/reduce_mean.h +++ b/mace/kernels/reduce_mean.cc @@ -12,29 +12,66 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_KERNELS_REDUCE_MEAN_H_ -#define MACE_KERNELS_REDUCE_MEAN_H_ - -#if defined(MACE_ENABLE_NEON) && defined(__aarch64__) -#include -#endif #include #include #include #include "mace/core/future.h" +#include "mace/core/operator.h" #include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/reduce_mean.h" +#endif // MACE_ENABLE_OPENCL + namespace mace { namespace kernels { -template -struct ReduceMeanFunctor : OpKernel { - ReduceMeanFunctor(OpKernelContext *context, - const std::vector &axis, - const bool keep_dims) - : OpKernel(context), axis_(axis), keep_dims_(keep_dims) {} +class ReduceMeanOpBase : public Operation { + public: + explicit ReduceMeanOpBase(OpConstructContext *context) + : Operation(context), + axis_(Operation::GetRepeatedArgs("axis")), + keep_dims_(Operation::GetOptionalArg("keepdims", false)) { + } + + protected: + inline void Validate() { + const Tensor *input = this->Input(0); + const int left = static_cast(input->dim_size() * -1); + const int right = static_cast(input->dim_size()); + if (axis_.size()) { + for (unsigned int i = 0; i < axis_.size(); ++i) { + MACE_CHECK(axis_[i] > left && axis_[i] < right, "Axis is over range."); + } + } + } + + protected: + const std::vector axis_; + bool keep_dims_; +}; + +template +class ReduceMeanOp; + +template +class ReduceMeanOp : public ReduceMeanOpBase { + public: + explicit ReduceMeanOp(OpConstructContext *context) + : ReduceMeanOpBase(context) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + Validate(); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + Simplify(input); + output->Resize(out_shape_); + Compute(input, output); + return MaceStatus::MACE_SUCCESS; + } + private: void Simplify(const Tensor *input) { std::vector bitmap(static_cast(input->dim_size()), false); if (axis_.size() == 0) { @@ -190,48 +227,49 @@ struct ReduceMeanFunctor : OpKernel { } } - MaceStatus 
operator()(const Tensor *input, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); - Simplify(input); - output->Resize(out_shape_); - Compute(input, output); - return MACE_SUCCESS; - } - - const std::vector axis_; - bool keep_dims_; + private: bool reduce_first_axis_; std::vector data_reshape_; std::vector out_shape_; }; #ifdef MACE_ENABLE_OPENCL -class OpenCLReduceMeanKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLReduceMeanKernel); -}; template -struct ReduceMeanFunctor : OpKernel { - ReduceMeanFunctor(OpKernelContext *context, - const std::vector &axis, - const bool keep_dims); +class ReduceMeanOp : public ReduceMeanOpBase { + public: + explicit ReduceMeanOp(OpConstructContext *context) + : ReduceMeanOpBase(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::ReduceMeanKernel(axis_, keep_dims_)); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + Validate(); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future); + return kernel_->Compute(context, input, output); + } + private: std::unique_ptr kernel_; }; -#endif +#endif // MACE_ENABLE_OPENCL + +void RegisterReduceMean(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "ReduceMean", ReduceMeanOp, + DeviceType::CPU, float); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "ReduceMean", ReduceMeanOp, + DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "ReduceMean", ReduceMeanOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_REDUCE_MEAN_H_ diff --git a/mace/ops/reshape.h b/mace/kernels/reshape.cc similarity index 68% rename from mace/ops/reshape.h 
rename to mace/kernels/reshape.cc index 86476de0..2cfef42b 100644 --- a/mace/ops/reshape.h +++ b/mace/kernels/reshape.cc @@ -12,24 +12,21 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_OPS_RESHAPE_H_ -#define MACE_OPS_RESHAPE_H_ - #include #include "mace/core/operator.h" -#include "mace/kernels/reshape.h" namespace mace { -namespace ops { +namespace kernels { -template -class ReshapeOp : public Operator { +template +class ReshapeOp : public Operation { public: - ReshapeOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), functor_(context) {} + explicit ReshapeOp(OpConstructContext *context) + : Operation(context) {} - MaceStatus Run(StatsFuture *future) override { + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); const Tensor *input = this->Input(INPUT); const Tensor *shape = this->Input(SHAPE); const index_t num_dims = shape->dim_size() == 0 ? 0 : shape->dim(0); @@ -63,19 +60,29 @@ class ReshapeOp : public Operator { } Tensor *output = this->Output(OUTPUT); + output->ReuseTensorBuffer(*input); + output->Reshape(out_shape); - return functor_(input, out_shape, output, future); + return MaceStatus::MACE_SUCCESS; } - private: - kernels::ReshapeFunctor functor_; - private: MACE_OP_INPUT_TAGS(INPUT, SHAPE); MACE_OP_OUTPUT_TAGS(OUTPUT); }; -} // namespace ops -} // namespace mace +void RegisterReshape(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Reshape", ReshapeOp, + DeviceType::CPU, float); + MACE_REGISTER_OP(op_registry, "Reshape", ReshapeOp, + DeviceType::CPU, int32_t); +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "Reshape", ReshapeOp, + DeviceType::GPU, float); + MACE_REGISTER_OP(op_registry, "Reshape", ReshapeOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} -#endif // MACE_OPS_RESHAPE_H_ +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/resize_bicubic.cc 
b/mace/kernels/resize_bicubic.cc new file mode 100644 index 00000000..fe0512ff --- /dev/null +++ b/mace/kernels/resize_bicubic.cc @@ -0,0 +1,234 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/kernels/resize_bicubic.h" + +#include +#include +#include + +#include "mace/core/operator.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/resize_bicubic.h" +#endif // MACE_ENABLE_OPENCL + +namespace mace { +namespace kernels { + +inline const std::shared_ptr InitCoeffsTable() { + // Allocate and initialize coefficients table using Bicubic + // convolution algorithm. 
+ // https://en.wikipedia.org/wiki/Bicubic_interpolation + auto coeffs_tab = std::shared_ptr( + new float[(resize_bicubic::kTableSize + 1) * 2], + std::default_delete()); + float *coeffs_tab_ptr = coeffs_tab.get(); + static const float A = -0.75f; + for (int i = 0; i <= resize_bicubic::kTableSize; ++i) { + float x = i * 1.0f / resize_bicubic::kTableSize; + coeffs_tab_ptr[i * 2] = ((A + 2) * x - (A + 3)) * x * x + 1; + x += 1.0; + coeffs_tab_ptr[i * 2 + 1] = ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; + } + return coeffs_tab; +} + +inline const float *GetCoeffsTable() { + // Static so that we initialize it on first use + static const std::shared_ptr coeffs_tab = InitCoeffsTable(); + return coeffs_tab.get(); +} + +inline int64_t Bound(int64_t val, int64_t limit) { + return std::min(limit - 1ll, std::max(0ll, val)); +} + +inline void GetWeightsAndIndices(float scale, int64_t out_loc, int64_t limit, + std::vector *weights, + std::vector *indices) { + auto in_loc = static_cast(scale * out_loc); + const float delta = scale * out_loc - in_loc; + const int64_t offset = lrintf(delta * resize_bicubic::kTableSize); + const float *coeffs_tab = GetCoeffsTable(); + *weights = {coeffs_tab[offset * 2 + 1], + coeffs_tab[offset * 2], + coeffs_tab[(resize_bicubic::kTableSize - offset) * 2], + coeffs_tab[(resize_bicubic::kTableSize - offset) * 2 + 1]}; + *indices = {Bound(in_loc - 1, limit), Bound(in_loc, limit), + Bound(in_loc + 1, limit), Bound(in_loc + 2, limit)}; +} + +inline float Interpolate1D(const std::vector &weights, + const std::vector &values) { + return values[0] * weights[0] + values[1] * weights[1] + + values[2] * weights[2] + values[3] * weights[3]; +} + +inline void ResizeImage(const float *images, + const index_t batch_size, + const index_t in_height, + const index_t in_width, + const index_t out_height, + const index_t out_width, + const index_t channels, + const float height_scale, + const float width_scale, + float *output) { +#pragma omp parallel for collapse(2) + 
for (index_t b = 0; b < batch_size; ++b) { + for (index_t y = 0; y < out_height; ++y) { + std::vector y_weights; + std::vector y_indices; + GetWeightsAndIndices(height_scale, y, in_height, &y_weights, + &y_indices); + for (index_t x = 0; x < out_width; ++x) { + std::vector x_weights; + std::vector x_indices; + GetWeightsAndIndices(width_scale, x, in_width, &x_weights, + &x_indices); + + for (index_t c = 0; c < channels; ++c) { + // Use a 4x4 patch to compute the interpolated output value at + // (b, y, x, c). + const float *channel_input_ptr = + images + (b * channels + c) * in_height * in_width; + float *channel_output_ptr = + output + (b * channels + c) * out_height * out_width; + std::vector coeff(4, 0.0); + for (index_t i = 0; i < 4; ++i) { + const std::vector values = { + channel_input_ptr[y_indices[i] * in_width + x_indices[0]], + channel_input_ptr[y_indices[i] * in_width + x_indices[1]], + channel_input_ptr[y_indices[i] * in_width + x_indices[2]], + channel_input_ptr[y_indices[i] * in_width + x_indices[3]]}; + coeff[i] = Interpolate1D(x_weights, values); + } + channel_output_ptr[y * out_width + x] = + Interpolate1D(y_weights, coeff); + } + } + } + } +} + +template +class ResizeBicubicOp; + +template <> +class ResizeBicubicOp : public Operation { + public: + explicit ResizeBicubicOp(OpConstructContext *context) + : Operation(context), + align_corners_(Operation::GetOptionalArg("align_corners", false)), + size_(Operation::GetRepeatedArgs("size", {-1, -1})) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + MACE_CHECK(size_.size() == 2); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + + MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional.", + input->dim_size()); + const index_t batch = input->dim(0); + const index_t channels = input->dim(1); + const index_t in_height = input->dim(2); + const index_t in_width = input->dim(3); + + index_t out_height = size_[0]; + index_t out_width = size_[1]; + 
MACE_CHECK(out_height > 0 && out_width > 0); + std::vector out_shape{batch, channels, out_height, out_width}; + MACE_RETURN_IF_ERROR(output->Resize(out_shape)); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard output_mapper(output); + const float *input_data = input->data(); + float *output_data = output->mutable_data(); + + if (out_height == in_height && out_width == in_width) { + std::copy(input_data, + input_data + batch * channels * in_height * in_width, + output_data); + return MaceStatus::MACE_SUCCESS; + } + + float height_scale = + resize_bicubic::CalculateResizeScale(in_height, + out_height, + align_corners_); + float width_scale = + resize_bicubic::CalculateResizeScale(in_width, + out_width, + align_corners_); + + ResizeImage(input_data, batch, in_height, in_width, out_height, out_width, + channels, height_scale, width_scale, output_data); + + return MaceStatus::MACE_SUCCESS; + } + + private: + bool align_corners_; + std::vector size_; +}; + +#ifdef MACE_ENABLE_OPENCL +template +class ResizeBicubicOp : public Operation { + public: + explicit ResizeBicubicOp(OpConstructContext *context) + : Operation(context) { + bool align_corners = Operation::GetOptionalArg( + "align_corners", false); + std::vector size = Operation::GetRepeatedArgs( + "size", {-1, -1}); + MACE_CHECK(size.size() == 2); + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::ResizeBicubicKernel(align_corners, + size[0], + size[1])); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional.", + input->dim_size()); + + return kernel_->Compute(context, input, output); + } + + private: + std::unique_ptr kernel_; +}; +#endif // MACE_ENABLE_OPENCL + +void RegisterResizeBicubic(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "ResizeBicubic", 
ResizeBicubicOp, + DeviceType::CPU, float); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "ResizeBicubic", ResizeBicubicOp, + DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "ResizeBicubic", ResizeBicubicOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/resize_bicubic.h b/mace/kernels/resize_bicubic.h index a33e0549..5e02edd4 100644 --- a/mace/kernels/resize_bicubic.h +++ b/mace/kernels/resize_bicubic.h @@ -15,68 +15,12 @@ #ifndef MACE_KERNELS_RESIZE_BICUBIC_H_ #define MACE_KERNELS_RESIZE_BICUBIC_H_ -#include -#include -#include -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" -#include "mace/utils/logging.h" +#include "mace/core/types.h" namespace mace { namespace kernels { - -static const int64_t kTableSize = (1 << 10); - -inline const std::shared_ptr InitCoeffsTable() { - // Allocate and initialize coefficients table using Bicubic - // convolution algorithm. 
- // https://en.wikipedia.org/wiki/Bicubic_interpolation - auto coeffs_tab = std::shared_ptr(new float[(kTableSize + 1) * 2], - std::default_delete()); - float *coeffs_tab_ptr = coeffs_tab.get(); - static const double A = -0.75; - for (int i = 0; i <= kTableSize; ++i) { - float x = i * 1.0 / kTableSize; - coeffs_tab_ptr[i * 2] = ((A + 2) * x - (A + 3)) * x * x + 1; - x += 1.0; - coeffs_tab_ptr[i * 2 + 1] = ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; - } - return coeffs_tab; -} - -inline const float *GetCoeffsTable() { - // Static so that we initialize it on first use - static const std::shared_ptr coeffs_tab = InitCoeffsTable(); - return coeffs_tab.get(); -} - -inline int64_t Bound(int64_t val, int64_t limit) { - return std::min(limit - 1ll, std::max(0ll, val)); -} - -inline void GetWeightsAndIndices(float scale, int64_t out_loc, int64_t limit, - std::vector *weights, - std::vector *indices) { - const int64_t in_loc = scale * out_loc; - const float delta = scale * out_loc - in_loc; - const int64_t offset = lrintf(delta * kTableSize); - const float *coeffs_tab = GetCoeffsTable(); - *weights = {coeffs_tab[offset * 2 + 1], - coeffs_tab[offset * 2], - coeffs_tab[(kTableSize - offset) * 2], - coeffs_tab[(kTableSize - offset) * 2 + 1]}; - *indices = {Bound(in_loc - 1, limit), Bound(in_loc, limit), - Bound(in_loc + 1, limit), Bound(in_loc + 2, limit)}; -} - -inline float Interpolate1D(const std::vector &weights, - const std::vector &values) { - return values[0] * weights[0] + values[1] * weights[1] + - values[2] * weights[2] + values[3] * weights[3]; -} +namespace resize_bicubic { +constexpr int64_t kTableSize = (1u << 10); inline float CalculateResizeScale(index_t in_size, index_t out_size, @@ -85,140 +29,7 @@ inline float CalculateResizeScale(index_t in_size, ? 
(in_size - 1) / static_cast(out_size - 1) : in_size / static_cast(out_size); } - -inline void ResizeImage(const float *images, - const index_t batch_size, - const index_t in_height, - const index_t in_width, - const index_t out_height, - const index_t out_width, - const index_t channels, - const float height_scale, - const float width_scale, - float *output) { -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch_size; ++b) { - for (index_t y = 0; y < out_height; ++y) { - std::vector y_weights; - std::vector y_indices; - GetWeightsAndIndices(height_scale, y, in_height, &y_weights, - &y_indices); - for (index_t x = 0; x < out_width; ++x) { - std::vector x_weights; - std::vector x_indices; - GetWeightsAndIndices(width_scale, x, in_width, &x_weights, - &x_indices); - - for (index_t c = 0; c < channels; ++c) { - // Use a 4x4 patch to compute the interpolated output value at - // (b, y, x, c). - const float *channel_input_ptr = - images + (b * channels + c) * in_height * in_width; - float *channel_output_ptr = - output + (b * channels + c) * out_height * out_width; - std::vector coeff(4, 0.0); - for (index_t i = 0; i < 4; ++i) { - const std::vector values = { - static_cast(channel_input_ptr - [y_indices[i] * in_width + x_indices[0]]), - static_cast(channel_input_ptr - [y_indices[i] * in_width + x_indices[1]]), - static_cast(channel_input_ptr - [y_indices[i] * in_width + x_indices[2]]), - static_cast(channel_input_ptr - [y_indices[i] * in_width + x_indices[3]])}; - coeff[i] = Interpolate1D(x_weights, values); - } - channel_output_ptr[y * out_width + x] = - Interpolate1D(y_weights, coeff); - } - } - } - } -} - -template -struct ResizeBicubicFunctor; - -template<> -struct ResizeBicubicFunctor : OpKernel { - ResizeBicubicFunctor(OpKernelContext *context, - const bool align_corners, - const std::vector &size) - : OpKernel(context), - align_corners_(align_corners) { - MACE_CHECK(size.size() == 2); - out_height_ = size[0]; - out_width_ = size[1]; - } - - 
MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); - const index_t batch = input->dim(0); - const index_t channels = input->dim(1); - const index_t in_height = input->dim(2); - const index_t in_width = input->dim(3); - - index_t out_height = out_height_; - index_t out_width = out_width_; - MACE_CHECK(out_height > 0 && out_width > 0); - std::vector out_shape{batch, channels, out_height, out_width}; - MACE_RETURN_IF_ERROR(output->Resize(out_shape)); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard output_mapper(output); - const float *input_data = input->data(); - float *output_data = output->mutable_data(); - - if (out_height == in_height && out_width == in_width) { - std::copy(input_data, - input_data + batch * channels * in_height * in_width, - output_data); - return MACE_SUCCESS; - } - - float height_scale = - CalculateResizeScale(in_height, out_height, align_corners_); - float width_scale = - CalculateResizeScale(in_width, out_width, align_corners_); - - ResizeImage(input_data, batch, in_height, in_width, out_height, out_width, - channels, height_scale, width_scale, output_data); - - return MACE_SUCCESS; - } - - bool align_corners_; - index_t out_height_; - index_t out_width_; -}; - -#ifdef MACE_ENABLE_OPENCL -class OpenCLResizeBicubicKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLResizeBicubicKernel); -}; -template -struct ResizeBicubicFunctor - : OpKernel { - ResizeBicubicFunctor(OpKernelContext *context, - bool align_corners, - const std::vector &size); - - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future); - - std::unique_ptr kernel_; -}; -#endif // MACE_ENABLE_OPENCL +} // namespace resize_bicubic } // namespace kernels } // namespace mace diff --git a/mace/kernels/resize_bilinear.cc 
b/mace/kernels/resize_bilinear.cc new file mode 100644 index 00000000..8ea86158 --- /dev/null +++ b/mace/kernels/resize_bilinear.cc @@ -0,0 +1,371 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/kernels/resize_bilinear.h" + +#include +#include +#include + +#include "mace/core/operator.h" +#include "mace/utils/quantize.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/resize_bilinear.h" +#endif // MACE_ENABLE_OPENCL + +namespace mace { +namespace kernels { + +struct CachedInterpolation { + index_t lower; // Lower source index used in the interpolation + index_t upper; // Upper source index used in the interpolation + // 1-D linear iterpolation scale (see: + // https://en.wikipedia.org/wiki/Bilinear_interpolation) + float lerp; +}; + +inline void ComputeInterpolationWeights( + const index_t out_size, + const index_t in_size, + const float scale, + CachedInterpolation *interpolation) { + interpolation[out_size].lower = 0; + interpolation[out_size].upper = 0; + for (index_t i = out_size - 1; i >= 0; --i) { + const float in = i * scale; + interpolation[i].lower = static_cast(in); + interpolation[i].upper = std::min(interpolation[i].lower + 1, in_size - 1); + interpolation[i].lerp = in - interpolation[i].lower; + } +} + +template +inline T ComputeLerp(const T top_left, + const T top_right, + const T bottom_left, + const T bottom_right, + const float x_lerp, + const float 
y_lerp); + +template <> +inline float ComputeLerp(const float top_left, + const float top_right, + const float bottom_left, + const float bottom_right, + const float x_lerp, + const float y_lerp) { + const float top = top_left + (top_right - top_left) * x_lerp; + const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp; + return top + (bottom - top) * y_lerp; +} + +template <> +inline uint8_t ComputeLerp(const uint8_t top_left, + const uint8_t top_right, + const uint8_t bottom_left, + const uint8_t bottom_right, + const float x_lerp, + const float y_lerp) { + const float top = top_left + (top_right - top_left) * x_lerp; + const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp; + return Saturate(roundf(top + (bottom - top) * y_lerp)); +} + +template +inline void ResizeImageNCHW(const T *images, + const index_t batch_size, + const index_t in_height, + const index_t in_width, + const index_t out_height, + const index_t out_width, + const index_t channels, + const std::vector &xs_vec, + const std::vector &ys, + T *output) { + const CachedInterpolation *xs = xs_vec.data(); + +#pragma omp parallel for collapse(2) + for (index_t b = 0; b < batch_size; ++b) { + for (index_t c = 0; c < channels; ++c) { + const T + *channel_input_ptr = + images + (b * channels + c) * in_height * in_width; + T *channel_output_ptr = + output + (b * channels + c) * out_height * out_width; + for (index_t y = 0; y < out_height; ++y) { + const T *y_lower_input_ptr = + channel_input_ptr + ys[y].lower * in_width; + const T *y_upper_input_ptr = + channel_input_ptr + ys[y].upper * in_width; + const float ys_lerp = ys[y].lerp; + + for (index_t x = 0; x < out_width; ++x) { + const float xs_lerp = xs[x].lerp; + const T top_left = y_lower_input_ptr[xs[x].lower]; + const T top_right = y_lower_input_ptr[xs[x].upper]; + const T bottom_left = y_upper_input_ptr[xs[x].lower]; + const T bottom_right = y_upper_input_ptr[xs[x].upper]; + channel_output_ptr[y * out_width + x] = + 
ComputeLerp(top_left, top_right, bottom_left, + bottom_right, xs_lerp, ys_lerp); + } + } + } + } +} + +template +inline void ResizeImageNHWC(const T *images, + const index_t batch_size, + const index_t in_height, + const index_t in_width, + const index_t out_height, + const index_t out_width, + const index_t channels, + const std::vector &xs_vec, + const std::vector &ys, + T *output) { + const CachedInterpolation *xs = xs_vec.data(); + + for (index_t b = 0; b < batch_size; ++b) { + const T *input_base = images + b * channels * in_height * in_width; + T *output_base = output + b * channels * out_height * out_width; +#pragma omp parallel for + for (index_t y = 0; y < out_height; ++y) { + const T + *y_lower_input_ptr = input_base + ys[y].lower * in_width * channels; + const T + *y_upper_input_ptr = input_base + ys[y].upper * in_width * channels; + const float ys_lerp = ys[y].lerp; + + for (index_t x = 0; x < out_width; ++x) { + const float xs_lerp = xs[x].lerp; + const T *top_left = y_lower_input_ptr + xs[x].lower * channels; + const T *top_right = y_lower_input_ptr + xs[x].upper * channels; + const T *bottom_left = y_upper_input_ptr + xs[x].lower * channels; + const T *bottom_right = y_upper_input_ptr + xs[x].upper * channels; + + T *output_ptr = output_base + (y * out_width + x) * channels; + for (index_t c = 0; c < channels; ++c) { + output_ptr[c] = + ComputeLerp(top_left[c], top_right[c], bottom_left[c], + bottom_right[c], xs_lerp, ys_lerp); + } + } + } + } +} + +template +class ResizeBilinearOp; + +template +class ResizeBilinearOp : public Operation { + public: + explicit ResizeBilinearOp(OpConstructContext *context) + : Operation(context), + align_corners_(Operation::GetOptionalArg("align_corners", false)), + size_(Operation::GetRepeatedArgs("size", {-1, -1})) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + MACE_CHECK(size_.size() == 2); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + + 
MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional.", + input->dim_size()); + const index_t batch = input->dim(0); + const index_t channels = input->dim(1); + const index_t in_height = input->dim(2); + const index_t in_width = input->dim(3); + + index_t out_height = size_[0]; + index_t out_width = size_[1]; + MACE_CHECK(out_height > 0 && out_width > 0); + std::vector out_shape{batch, channels, out_height, out_width}; + MACE_RETURN_IF_ERROR(output->Resize(out_shape)); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard output_mapper(output); + const T *input_data = input->data(); + T *output_data = output->mutable_data(); + + if (out_height == in_height && out_width == in_width) { + std::copy(input_data, + input_data + batch * channels * in_height * in_width, + output_data); + return MaceStatus::MACE_SUCCESS; + } + + float height_scale = + resize_bilinear::CalculateResizeScale(in_height, + out_height, + align_corners_); + float width_scale = + resize_bilinear::CalculateResizeScale(in_width, + out_width, + align_corners_); + + std::vector ys(out_height + 1); + std::vector xs(out_width + 1); + + // Compute the cached interpolation weights on the x and y dimensions. 
+ ComputeInterpolationWeights(out_height, in_height, height_scale, ys.data()); + ComputeInterpolationWeights(out_width, in_width, width_scale, xs.data()); + + ResizeImageNCHW(input_data, + batch, + in_height, + in_width, + out_height, + out_width, + channels, + xs, + ys, + output_data); + + return MaceStatus::MACE_SUCCESS; + } + + private: + bool align_corners_; + std::vector size_; +}; + +template <> +class ResizeBilinearOp : public Operation { + public: + explicit ResizeBilinearOp(OpConstructContext *context) + : Operation(context), + align_corners_(Operation::GetOptionalArg("align_corners", false)), + size_(Operation::GetRepeatedArgs("size", {-1, -1})) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + MACE_CHECK(size_.size() == 2); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + + MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional.", + input->dim_size()); + const index_t batch = input->dim(0); + const index_t in_height = input->dim(1); + const index_t in_width = input->dim(2); + const index_t channels = input->dim(3); + + index_t out_height = size_[0]; + index_t out_width = size_[1]; + MACE_CHECK(out_height > 0 && out_width > 0); + std::vector out_shape{batch, out_height, out_width, channels}; + MACE_RETURN_IF_ERROR(output->Resize(out_shape)); + + Tensor::MappingGuard input_mapper(input); + Tensor::MappingGuard output_mapper(output); + const uint8_t *input_data = input->data(); + uint8_t *output_data = output->mutable_data(); + + if (out_height == in_height && out_width == in_width) { + std::copy(input_data, + input_data + batch * in_height * in_width * channels , + output_data); + return MaceStatus::MACE_SUCCESS; + } + + float height_scale = + resize_bilinear::CalculateResizeScale(in_height, + out_height, + align_corners_); + float width_scale = + resize_bilinear::CalculateResizeScale(in_width, + out_width, + align_corners_); + + std::vector ys(out_height + 1); + std::vector xs(out_width + 
1); + + // Compute the cached interpolation weights on the x and y dimensions. + ComputeInterpolationWeights(out_height, in_height, height_scale, ys.data()); + ComputeInterpolationWeights(out_width, in_width, width_scale, xs.data()); + + ResizeImageNHWC(input_data, + batch, + in_height, + in_width, + out_height, + out_width, + channels, + xs, + ys, + output_data); + + return MaceStatus::MACE_SUCCESS; + } + + private: + bool align_corners_; + std::vector size_; +}; + +#ifdef MACE_ENABLE_OPENCL +template +class ResizeBilinearOp : public Operation { + public: + explicit ResizeBilinearOp(OpConstructContext *context) + : Operation(context) { + bool align_corners = Operation::GetOptionalArg( + "align_corners", false); + std::vector size = Operation::GetRepeatedArgs( + "size", {-1, -1}); + MACE_CHECK(size.size() == 2); + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::ResizeBilinearKernel(align_corners, + size[0], + size[1])); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional.", + input->dim_size()); + + return kernel_->Compute(context, input, output); + } + + private: + std::unique_ptr kernel_; +}; +#endif // MACE_ENABLE_OPENCL + +void RegisterResizeBilinear(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "ResizeBilinear", ResizeBilinearOp, + DeviceType::CPU, float); + + MACE_REGISTER_OP(op_registry, "ResizeBilinear", ResizeBilinearOp, + DeviceType::CPU, uint8_t); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "ResizeBilinear", ResizeBilinearOp, + DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "ResizeBilinear", ResizeBilinearOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/resize_bilinear.h 
b/mace/kernels/resize_bilinear.h index ea3f7aa3..1f94e500 100644 --- a/mace/kernels/resize_bilinear.h +++ b/mace/kernels/resize_bilinear.h @@ -15,26 +15,11 @@ #ifndef MACE_KERNELS_RESIZE_BILINEAR_H_ #define MACE_KERNELS_RESIZE_BILINEAR_H_ -#include -#include -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" -#include "mace/utils/quantize.h" +#include "mace/core/types.h" namespace mace { namespace kernels { - -struct CachedInterpolation { - index_t lower; // Lower source index used in the interpolation - index_t upper; // Upper source index used in the interpolation - // 1-D linear iterpolation scale (see: - // https://en.wikipedia.org/wiki/Bilinear_interpolation) - float lerp; -}; - +namespace resize_bilinear { inline float CalculateResizeScale(index_t in_size, index_t out_size, bool align_corners) { @@ -42,302 +27,7 @@ inline float CalculateResizeScale(index_t in_size, ? (in_size - 1) / static_cast(out_size - 1) : in_size / static_cast(out_size); } - -inline void ComputeInterpolationWeights( - const index_t out_size, - const index_t in_size, - const float scale, - CachedInterpolation *interpolation) { - interpolation[out_size].lower = 0; - interpolation[out_size].upper = 0; - for (index_t i = out_size - 1; i >= 0; --i) { - const float in = i * scale; - interpolation[i].lower = static_cast(in); - interpolation[i].upper = std::min(interpolation[i].lower + 1, in_size - 1); - interpolation[i].lerp = in - interpolation[i].lower; - } -} - -template -inline T ComputeLerp(const T top_left, - const T top_right, - const T bottom_left, - const T bottom_right, - const float x_lerp, - const float y_lerp); - -template <> -inline float ComputeLerp(const float top_left, - const float top_right, - const float bottom_left, - const float bottom_right, - const float x_lerp, - const float y_lerp) { - const float top = top_left + (top_right - top_left) * x_lerp; - const float bottom = bottom_left + (bottom_right - bottom_left) * 
x_lerp; - return top + (bottom - top) * y_lerp; -} - -template <> -inline uint8_t ComputeLerp(const uint8_t top_left, - const uint8_t top_right, - const uint8_t bottom_left, - const uint8_t bottom_right, - const float x_lerp, - const float y_lerp) { - const float top = top_left + (top_right - top_left) * x_lerp; - const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp; - return Saturate(roundf(top + (bottom - top) * y_lerp)); -} - -template -inline void ResizeImageNCHW(const T *images, - const index_t batch_size, - const index_t in_height, - const index_t in_width, - const index_t out_height, - const index_t out_width, - const index_t channels, - const std::vector &xs_vec, - const std::vector &ys, - T *output) { - const CachedInterpolation *xs = xs_vec.data(); - -#pragma omp parallel for collapse(2) - for (index_t b = 0; b < batch_size; ++b) { - for (index_t c = 0; c < channels; ++c) { - const T - *channel_input_ptr = - images + (b * channels + c) * in_height * in_width; - T *channel_output_ptr = - output + (b * channels + c) * out_height * out_width; - for (index_t y = 0; y < out_height; ++y) { - const T *y_lower_input_ptr = - channel_input_ptr + ys[y].lower * in_width; - const T *y_upper_input_ptr = - channel_input_ptr + ys[y].upper * in_width; - const float ys_lerp = ys[y].lerp; - - for (index_t x = 0; x < out_width; ++x) { - const float xs_lerp = xs[x].lerp; - const T top_left = y_lower_input_ptr[xs[x].lower]; - const T top_right = y_lower_input_ptr[xs[x].upper]; - const T bottom_left = y_upper_input_ptr[xs[x].lower]; - const T bottom_right = y_upper_input_ptr[xs[x].upper]; - channel_output_ptr[y * out_width + x] = - ComputeLerp(top_left, top_right, bottom_left, - bottom_right, xs_lerp, ys_lerp); - } - } - } - } -} - -template -inline void ResizeImageNHWC(const T *images, - const index_t batch_size, - const index_t in_height, - const index_t in_width, - const index_t out_height, - const index_t out_width, - const index_t channels, - const 
std::vector &xs_vec, - const std::vector &ys, - T *output) { - const CachedInterpolation *xs = xs_vec.data(); - - for (index_t b = 0; b < batch_size; ++b) { - const T *input_base = images + b * channels * in_height * in_width; - T *output_base = output + b * channels * out_height * out_width; -#pragma omp parallel for - for (index_t y = 0; y < out_height; ++y) { - const T - *y_lower_input_ptr = input_base + ys[y].lower * in_width * channels; - const T - *y_upper_input_ptr = input_base + ys[y].upper * in_width * channels; - const float ys_lerp = ys[y].lerp; - - for (index_t x = 0; x < out_width; ++x) { - const float xs_lerp = xs[x].lerp; - const T *top_left = y_lower_input_ptr + xs[x].lower * channels; - const T *top_right = y_lower_input_ptr + xs[x].upper * channels; - const T *bottom_left = y_upper_input_ptr + xs[x].lower * channels; - const T *bottom_right = y_upper_input_ptr + xs[x].upper * channels; - - T *output_ptr = output_base + (y * out_width + x) * channels; - for (index_t c = 0; c < channels; ++c) { - output_ptr[c] = - ComputeLerp(top_left[c], top_right[c], bottom_left[c], - bottom_right[c], xs_lerp, ys_lerp); - } - } - } - } -} - -template -struct ResizeBilinearFunctor : OpKernel { - ResizeBilinearFunctor(OpKernelContext *context, - const std::vector &size, - bool align_corners) - : OpKernel(context), align_corners_(align_corners) { - MACE_CHECK(size.size() == 2); - out_height_ = size[0]; - out_width_ = size[1]; - } - - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); - const index_t batch = input->dim(0); - const index_t channels = input->dim(1); - const index_t in_height = input->dim(2); - const index_t in_width = input->dim(3); - - index_t out_height = out_height_; - index_t out_width = out_width_; - MACE_CHECK(out_height > 0 && out_width > 0); - std::vector out_shape{batch, channels, out_height, out_width}; - MACE_RETURN_IF_ERROR(output->Resize(out_shape)); - - Tensor::MappingGuard 
input_mapper(input); - Tensor::MappingGuard output_mapper(output); - const T *input_data = input->data(); - T *output_data = output->mutable_data(); - - if (out_height == in_height && out_width == in_width) { - std::copy(input_data, - input_data + batch * channels * in_height * in_width, - output_data); - return MACE_SUCCESS; - } - - float height_scale = - CalculateResizeScale(in_height, out_height, align_corners_); - float width_scale = - CalculateResizeScale(in_width, out_width, align_corners_); - - std::vector ys(out_height + 1); - std::vector xs(out_width + 1); - - // Compute the cached interpolation weights on the x and y dimensions. - ComputeInterpolationWeights(out_height, in_height, height_scale, ys.data()); - ComputeInterpolationWeights(out_width, in_width, width_scale, xs.data()); - - ResizeImageNCHW(input_data, - batch, - in_height, - in_width, - out_height, - out_width, - channels, - xs, - ys, - output_data); - - return MACE_SUCCESS; - } - - bool align_corners_; - index_t out_height_; - index_t out_width_; -}; - -template -struct ResizeBilinearFunctor : OpKernel { - ResizeBilinearFunctor(OpKernelContext *context, - const std::vector &size, - bool align_corners) - : OpKernel(context), align_corners_(align_corners) { - MACE_CHECK(size.size() == 2); - out_height_ = size[0]; - out_width_ = size[1]; - } - - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); - const index_t batch = input->dim(0); - const index_t in_height = input->dim(1); - const index_t in_width = input->dim(2); - const index_t channels = input->dim(3); - - index_t out_height = out_height_; - index_t out_width = out_width_; - MACE_CHECK(out_height > 0 && out_width > 0); - std::vector out_shape{batch, out_height, out_width, channels}; - MACE_RETURN_IF_ERROR(output->Resize(out_shape)); - - Tensor::MappingGuard input_mapper(input); - Tensor::MappingGuard output_mapper(output); - const uint8_t *input_data = input->data(); - uint8_t 
*output_data = output->mutable_data(); - - if (out_height == in_height && out_width == in_width) { - std::copy(input_data, - input_data + batch * in_height * in_width * channels , - output_data); - return MACE_SUCCESS; - } - - float height_scale = - CalculateResizeScale(in_height, out_height, align_corners_); - float width_scale = - CalculateResizeScale(in_width, out_width, align_corners_); - - std::vector ys(out_height + 1); - std::vector xs(out_width + 1); - - // Compute the cached interpolation weights on the x and y dimensions. - ComputeInterpolationWeights(out_height, in_height, height_scale, ys.data()); - ComputeInterpolationWeights(out_width, in_width, width_scale, xs.data()); - - ResizeImageNHWC(input_data, - batch, - in_height, - in_width, - out_height, - out_width, - channels, - xs, - ys, - output_data); - - return MACE_SUCCESS; - } - - bool align_corners_; - index_t out_height_; - index_t out_width_; -}; - -#ifdef MACE_ENABLE_OPENCL -class OpenCLResizeBilinearKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLResizeBilinearKernel); -}; -template -struct ResizeBilinearFunctor - : OpKernel { - ResizeBilinearFunctor(OpKernelContext *context, - const std::vector &size, - bool align_corners); - - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future); - - std::unique_ptr kernel_; -}; -#endif // MACE_ENABLE_OPENCL - +} // namespace resize_bilinear } // namespace kernels } // namespace mace diff --git a/mace/kernels/reverse.h b/mace/kernels/reverse.cc similarity index 73% rename from mace/kernels/reverse.h rename to mace/kernels/reverse.cc index 69d5fd6d..f73db418 100644 --- a/mace/kernels/reverse.h +++ b/mace/kernels/reverse.cc @@ -12,38 +12,31 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_KERNELS_REVERSE_H_ -#define MACE_KERNELS_REVERSE_H_ - -#include -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" - -#ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#endif // MACE_ENABLE_OPENCL +#include "mace/core/operator.h" namespace mace { namespace kernels { template -struct ReverseFunctor; +class ReverseOp; template -struct ReverseFunctor : OpKernel { - explicit ReverseFunctor(OpKernelContext *context) : OpKernel(context) {} - MaceStatus operator()(const Tensor *input, - const Tensor *axis, - Tensor *output, - StatsFuture *future) { +class ReverseOp : public Operation { + public: + explicit ReverseOp(OpConstructContext *context) + : Operation(context) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(INPUT); + const Tensor *axis = this->Input(AXIS); + Tensor *output = this->Output(OUTPUT); + MACE_CHECK(axis->dim_size() == 1, "Only support reverse in one axis now"); const int32_t *axis_data = axis->data(); const index_t reverse_dim = *axis_data >= 0 ? 
- *axis_data : *axis_data + input->dim_size(); + *axis_data : *axis_data + input->dim_size(); MACE_CHECK(reverse_dim >= 0 && reverse_dim < input->dim_size(), "axis must be in the range [-rank(input), rank(input))"); @@ -71,13 +64,18 @@ struct ReverseFunctor : OpKernel { input_idx += low_dim_elem_size; } } - - SetFutureDefaultWaitFn(future); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } + + private: + MACE_OP_INPUT_TAGS(INPUT, AXIS); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; +void RegisterReverse(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Reverse", ReverseOp, + DeviceType::CPU, float); +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_REVERSE_H_ diff --git a/mace/kernels/scalar_math.h b/mace/kernels/scalar_math.cc similarity index 72% rename from mace/kernels/scalar_math.h rename to mace/kernels/scalar_math.cc index 928a4954..f9f4822a 100644 --- a/mace/kernels/scalar_math.h +++ b/mace/kernels/scalar_math.cc @@ -12,15 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_KERNELS_SCALAR_MATH_H_ -#define MACE_KERNELS_SCALAR_MATH_H_ - #include #include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/public/mace.h" +#include "mace/core/operator.h" #include "mace/kernels/eltwise.h" namespace mace { @@ -89,23 +84,22 @@ void ScalarEltwise(const T* in0, template -struct ScalarMathFunctor : OpKernel { - ScalarMathFunctor(OpKernelContext *context, - const EltwiseType type, - const std::vector &coeff, - const float scalar_input, - const int32_t scalar_input_index) - : OpKernel(context), - type_(type), - coeff_(coeff), - scalar_input_(scalar_input), - scalar_input_index_(scalar_input_index) {} +class ScalarMathOp : public Operation { + public: + explicit ScalarMathOp(OpConstructContext *context) + : Operation(context), + type_(static_cast(Operation::GetOptionalArg( + "type", static_cast(kernels::EltwiseType::NONE)))), + coeff_(Operation::GetRepeatedArgs("coeff")), + scalar_input_(Operation::GetOptionalArg("scalar_input", 1.0)), + scalar_input_index_(Operation::GetOptionalArg( + "scalar_input_index", 1)) {} - MaceStatus operator()(const std::vector &inputs, - Tensor *output, - StatsFuture *future) { - const Tensor* input0 = inputs[0]; - const Tensor* input1 = (inputs.size() >= 2) ? inputs[1] : nullptr; + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + Tensor *output = this->Output(0); + const Tensor* input0 = inputs_[0]; + const Tensor* input1 = (inputs_.size() >= 2) ? 
inputs_[1] : nullptr; MACE_CHECK(input0->dim_size() <= 1 && input0->size() == 1, "not support input dim size") << input0->dim_size(); Tensor::MappingGuard in0_guard(input0); @@ -143,18 +137,28 @@ struct ScalarMathFunctor : OpKernel { swapped, out); } - - SetFutureDefaultWaitFn(future); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } + private: EltwiseType type_; std::vector coeff_; float scalar_input_; int32_t scalar_input_index_; }; +void RegisterScalarMath(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "ScalarMath", ScalarMathOp, + DeviceType::CPU, float); + MACE_REGISTER_OP(op_registry, "ScalarMath", ScalarMathOp, + DeviceType::CPU, int32_t); +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "ScalarMath", ScalarMathOp, + DeviceType::GPU, float); + MACE_REGISTER_OP(op_registry, "ScalarMath", ScalarMathOp, + DeviceType::GPU, int32_t); +#endif // MACE_ENABLE_OPENCL +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_SCALAR_MATH_H_ diff --git a/mace/ops/shape.h b/mace/kernels/shape.cc similarity index 70% rename from mace/ops/shape.h rename to mace/kernels/shape.cc index abb9ffb3..1775f0a0 100644 --- a/mace/ops/shape.h +++ b/mace/kernels/shape.cc @@ -12,23 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_OPS_SHAPE_H_ -#define MACE_OPS_SHAPE_H_ - -#include - #include "mace/core/operator.h" namespace mace { -namespace ops { +namespace kernels { template -class ShapeOp : public Operator { +class ShapeOp : public Operation { public: - ShapeOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context) {} + explicit ShapeOp(OpConstructContext *context) + : Operation(context) {} - MaceStatus Run(StatsFuture *future) override { + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); const Tensor *input = this->Input(INPUT); Tensor *output = this->Output(OUTPUT); if (input->dim_size() > 0) { @@ -40,7 +36,7 @@ class ShapeOp : public Operator { int32_t *output_data = output->mutable_data(); const int data_format = - OperatorBase::GetOptionalArg("data_format", 0); + Operation::GetOptionalArg("data_format", 0); if (input->dim_size() == 4 && D == DeviceType::CPU && data_format == DataFormat::NCHW) { @@ -54,9 +50,8 @@ class ShapeOp : public Operator { output_data[i] = static_cast(input->dim(i)); } } - SetFutureDefaultWaitFn(future); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } private: @@ -64,7 +59,16 @@ class ShapeOp : public Operator { MACE_OP_OUTPUT_TAGS(OUTPUT); }; -} // namespace ops -} // namespace mace +void RegisterShape(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Shape", ShapeOp, + DeviceType::CPU, float); +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "Shape", ShapeOp, + DeviceType::GPU, float); + MACE_REGISTER_OP(op_registry, "Shape", ShapeOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} -#endif // MACE_OPS_SHAPE_H_ +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/softmax.h b/mace/kernels/softmax.cc similarity index 86% rename from mace/kernels/softmax.h rename to mace/kernels/softmax.cc index 6afca75a..1ac3ab4d 100644 --- a/mace/kernels/softmax.h +++ b/mace/kernels/softmax.cc @@ -12,37 +12,36 @@ // See the License for the specific 
language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_SOFTMAX_H_ -#define MACE_KERNELS_SOFTMAX_H_ - #include -#include +#include #include #include -#include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/core/operator.h" #include "mace/kernels/fixpoint.h" #include "mace/kernels/gemmlowp_util.h" -#include "mace/kernels/kernel.h" -#include "mace/kernels/quantize.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/softmax.h" +#include "mace/kernels/opencl/buffer/softmax.h" +#endif // MACE_ENABLE_OPENCL namespace mace { namespace kernels { -template -struct SoftmaxFunctor; +template +class SoftmaxOp; -template<> -struct SoftmaxFunctor : OpKernel { - explicit SoftmaxFunctor(OpKernelContext *context) : OpKernel(context) {} - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); +template <> +class SoftmaxOp : public Operation { + public: + explicit SoftmaxOp(OpConstructContext *context) + : Operation(context) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); Tensor::MappingGuard input_guard(input); Tensor::MappingGuard output_guard(output); const float *input_data = input->data(); @@ -116,21 +115,24 @@ struct SoftmaxFunctor : OpKernel { } else { MACE_NOT_IMPLEMENTED; } - - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } }; static const int kInputDeltaIntBits = 6; static const int kSumExpIntBits = 12; -template<> -struct SoftmaxFunctor : OpKernel { - explicit SoftmaxFunctor(OpKernelContext *context) : OpKernel(context) {} - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); +template <> +class SoftmaxOp : public Operation { + public: + explicit 
SoftmaxOp(OpConstructContext *context) + : Operation(context) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); // Ignore range stat, fix range to [0, 1]. For large depth, each softmax // output may be too small (<<1), which causes precision issue. But it is // fine when doing classification inference. @@ -186,7 +188,7 @@ struct SoftmaxFunctor : OpKernel { output_ptr[d] = static_cast(output_f * 255); } } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } int32_t scale_q = static_cast(std::min( @@ -346,33 +348,51 @@ struct SoftmaxFunctor : OpKernel { } } } - - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } }; #ifdef MACE_ENABLE_OPENCL -class OpenCLSoftmaxKernel { +template +class SoftmaxOp : public Operation { public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *logits, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLSoftmaxKernel); -}; -template -struct SoftmaxFunctor : OpKernel { - explicit SoftmaxFunctor(OpKernelContext *context); - MaceStatus operator()(const Tensor *logits, - Tensor *output, - StatsFuture *future); + explicit SoftmaxOp(OpConstructContext *context) + : Operation(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::SoftmaxKernel); + } else { + kernel_.reset(new opencl::buffer::SoftmaxKernel); + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + MACE_RETURN_IF_ERROR(output->ResizeLike(input)); + + return kernel_->Compute(context, input, output); + } + private: std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL + +void RegisterSoftmax(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp, + DeviceType::CPU, float); + + 
MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp, + DeviceType::CPU, uint8_t); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp, + DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "Softmax", SoftmaxOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_SOFTMAX_H_ diff --git a/mace/kernels/space_to_batch.h b/mace/kernels/space_to_batch.cc similarity index 79% rename from mace/kernels/space_to_batch.h rename to mace/kernels/space_to_batch.cc index 337baefc..41c731c5 100644 --- a/mace/kernels/space_to_batch.h +++ b/mace/kernels/space_to_batch.cc @@ -12,33 +12,30 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_SPACE_TO_BATCH_H_ -#define MACE_KERNELS_SPACE_TO_BATCH_H_ - -#include -#include #include +#include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" +#include "mace/core/operator.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/space_to_batch.h" +#endif // MACE_ENABLE_OPENCL namespace mace { namespace kernels { -struct SpaceToBatchFunctorBase : OpKernel { - SpaceToBatchFunctorBase(OpKernelContext *context, - const std::vector &paddings, - const std::vector &block_shape) - : OpKernel(context), - paddings_(paddings.begin(), paddings.end()), - block_shape_(block_shape.begin(), block_shape.end()) { +class SpaceToBatchOpBase : public Operation { + public: + explicit SpaceToBatchOpBase(OpConstructContext *context) + : Operation(context), + paddings_(Operation::GetRepeatedArgs("paddings", {0, 0, 0, 0})), + block_shape_(Operation::GetRepeatedArgs("block_shape", {1, 1})) { MACE_CHECK( - block_shape.size() == 2 && block_shape[0] > 1 && block_shape[1] > 1, + block_shape_.size() == 2 && block_shape_[0] > 1 && block_shape_[1] > 1, "Block's shape should be 1D, and greater than 1"); - MACE_CHECK(paddings.size() 
== 4, "Paddings' shape should be 2D"); + MACE_CHECK(paddings_.size() == 4, "Paddings' shape should be 2D"); } + protected: std::vector paddings_; std::vector block_shape_; @@ -88,21 +85,19 @@ struct SpaceToBatchFunctorBase : OpKernel { } }; -template -struct SpaceToBatchFunctor; - -template<> -struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { - SpaceToBatchFunctor(OpKernelContext *context, - const std::vector &paddings, - const std::vector &block_shape) - : SpaceToBatchFunctorBase(context, paddings, block_shape) {} +template +class SpaceToBatchNDOp; - MaceStatus operator()(const Tensor *space_tensor, - Tensor *batch_tensor, - StatsFuture *future) { - MACE_UNUSED(future); +template <> +class SpaceToBatchNDOp : public SpaceToBatchOpBase { + public: + explicit SpaceToBatchNDOp(OpConstructContext *context) + : SpaceToBatchOpBase(context) {} + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *space_tensor = this->Input(0); + Tensor *batch_tensor = this->Output(0); std::vector output_shape(4, 0); CalculateSpaceToBatchOutputShape(space_tensor, @@ -197,22 +192,21 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { } // b } // block_h } // c - return MACE_SUCCESS; + + return MaceStatus::MACE_SUCCESS; } }; -template<> -struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { - SpaceToBatchFunctor(OpKernelContext *context, - const std::vector &paddings, - const std::vector &block_shape) - : SpaceToBatchFunctorBase(context, paddings, block_shape) {} - - MaceStatus operator()(const Tensor *space_tensor, - Tensor *batch_tensor, - StatsFuture *future) { - MACE_UNUSED(future); +template <> +class SpaceToBatchNDOp : public SpaceToBatchOpBase { + public: + explicit SpaceToBatchNDOp(OpConstructContext *context) + : SpaceToBatchOpBase(context) {} + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *space_tensor = this->Input(0); + Tensor *batch_tensor = this->Output(0); std::vector output_shape(4, 0); 
CalculateSpaceToBatchOutputShape(space_tensor, @@ -302,38 +296,52 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { * sizeof(uint8_t)); } // b - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } }; #ifdef MACE_ENABLE_OPENCL -class OpenCLSpaceToBatchKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *space_tensor, - const std::vector &paddings, - const std::vector &block_shape, - const std::vector &output_shape, - Tensor *batch_tensor, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLSpaceToBatchKernel); -}; template -struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { - SpaceToBatchFunctor(OpKernelContext *context, - const std::vector &paddings, - const std::vector &block_shape); - - MaceStatus operator()(const Tensor *space_tensor, - Tensor *batch_tensor, - StatsFuture *future); +class SpaceToBatchNDOp : public SpaceToBatchOpBase { + public: + explicit SpaceToBatchNDOp(OpConstructContext *context) + : SpaceToBatchOpBase(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::SpaceToBatchKernel); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *space_tensor = this->Input(0); + Tensor *batch_tensor = this->Output(0); + std::vector output_shape(4, 0); + CalculateSpaceToBatchOutputShape(space_tensor, DataFormat::NHWC, + output_shape.data()); + return kernel_->Compute(context, space_tensor, paddings_, block_shape_, + output_shape, batch_tensor); + } + private: std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL +void RegisterSpaceToBatchND(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "SpaceToBatchND", + SpaceToBatchNDOp, DeviceType::CPU, float); + + MACE_REGISTER_OP(op_registry, "SpaceToBatchND", + SpaceToBatchNDOp, DeviceType::CPU, uint8_t); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "SpaceToBatchND", + SpaceToBatchNDOp, DeviceType::GPU, 
float); + + MACE_REGISTER_OP(op_registry, "SpaceToBatchND", + SpaceToBatchNDOp, DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_SPACE_TO_BATCH_H_ diff --git a/mace/kernels/space_to_depth.h b/mace/kernels/space_to_depth.cc similarity index 60% rename from mace/kernels/space_to_depth.h rename to mace/kernels/space_to_depth.cc index 2f379bbf..e2e302e6 100644 --- a/mace/kernels/space_to_depth.h +++ b/mace/kernels/space_to_depth.cc @@ -12,28 +12,29 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_SPACE_TO_DEPTH_H_ -#define MACE_KERNELS_SPACE_TO_DEPTH_H_ #include #include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/public/mace.h" -#include "mace/kernels/kernel.h" +#include "mace/core/operator.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/space_to_depth.h" +#endif // MACE_ENABLE_OPENCL namespace mace { namespace kernels { -template -struct SpaceToDepthOpFunctor : OpKernel { - SpaceToDepthOpFunctor(OpKernelContext *context, - const int block_size) - : OpKernel(context), block_size_(block_size) {} - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); +template +class SpaceToDepthOp : public Operation { + public: + explicit SpaceToDepthOp(OpConstructContext *context) + : Operation(context), + block_size_(Operation::GetOptionalArg("block_size", 1)) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + MACE_CHECK(input->dim_size() == 4, "input dim should be 4"); const index_t batch_size = input->dim(0); const index_t input_depth = input->dim(1); const index_t input_height = input->dim(2); @@ -79,36 +80,50 @@ struct SpaceToDepthOpFunctor : OpKernel { } } } - - return MACE_SUCCESS; + return 
MaceStatus::MACE_SUCCESS; } + private: const int block_size_; }; #ifdef MACE_ENABLE_OPENCL -class OpenCLSpaceToDepthKernel { +template +class SpaceToDepthOp : public Operation { public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLSpaceToDepthKernel); -}; -template -struct SpaceToDepthOpFunctor : OpKernel { - explicit SpaceToDepthOpFunctor(OpKernelContext *context, - const int block_size); - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future); + explicit SpaceToDepthOp(OpConstructContext *context) + : Operation(context) { + int block_size = Operation::GetOptionalArg("block_size", 1); + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::SpaceToDepthKernel(block_size)); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + MACE_CHECK(input->dim_size() == 4, "input dim should be 4"); + return kernel_->Compute(context, input, output); + } + private: std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL +void RegisterSpaceToDepth(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "SpaceToDepth", + SpaceToDepthOp, DeviceType::CPU, float); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "SpaceToDepth", + SpaceToDepthOp, DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "SpaceToDepth", + SpaceToDepthOp, DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_SPACE_TO_DEPTH_H_ diff --git a/mace/kernels/split.h b/mace/kernels/split.cc similarity index 55% rename from mace/kernels/split.h rename to mace/kernels/split.cc index ffef9699..68f5f274 100644 --- a/mace/kernels/split.h +++ b/mace/kernels/split.cc @@ -12,31 +12,35 @@ // See the License for the specific 
language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_SPLIT_H_ -#define MACE_KERNELS_SPLIT_H_ - -#include #include -#include +#include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/core/types.h" -#include "mace/kernels/kernel.h" -#include "mace/public/mace.h" +#include "mace/core/operator.h" +#ifdef MACE_ENABLE_OPENCL +#include "mace/kernels/opencl/image/split.h" +#endif // MACE_ENABLE_OPENCL namespace mace { namespace kernels { -template -struct SplitFunctor : OpKernel { - SplitFunctor(OpKernelContext *context, const int32_t axis) - : OpKernel(context), axis_(axis) {} +template +class SplitOp; - MaceStatus operator()(const Tensor *input, - const std::vector &output_list, - StatsFuture *future) { - MACE_UNUSED(future); +template +class SplitOp : public Operation { + public: + explicit SplitOp(OpConstructContext *context) + : Operation(context), + axis_(Operation::GetOptionalArg("axis", 3)) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + MACE_CHECK(this->OutputSize() >= 2) + << "There must be at least two outputs for slicing"; + const Tensor *input = this->Input(0); + const std::vector output_list = this->Outputs(); + MACE_CHECK((input->dim(axis_) % this->OutputSize()) == 0) + << "Outputs do not split input equally."; const index_t input_channels = input->dim(axis_); const size_t outputs_count = output_list.size(); const index_t output_channels = input_channels / outputs_count; @@ -74,35 +78,56 @@ struct SplitFunctor : OpKernel { input_idx += output_channels * inner_size; } } - - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } + private: int32_t axis_; }; + #ifdef MACE_ENABLE_OPENCL -class OpenCLSplitKernel { +template +class SplitOp : public Operation { public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - const std::vector &output_list, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLSplitKernel); -}; 
-template -struct SplitFunctor : OpKernel { - SplitFunctor(OpKernelContext *context, const int32_t axis); + explicit SplitOp(OpConstructContext *context) + : Operation(context) { + int32_t axis = Operation::GetOptionalArg("axis", 3); + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::SplitKernel(axis)); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + MACE_CHECK(this->OutputSize() >= 2) + << "There must be at least two outputs for slicing"; + const Tensor *input = this->Input(0); + const std::vector output_list = this->Outputs(); + int32_t axis = Operation::GetOptionalArg("axis", 3); + MACE_CHECK((input->dim(axis) % this->OutputSize()) == 0) + << "Outputs do not split input equally."; + return kernel_->Compute(context, input, output_list); + } - MaceStatus operator()(const Tensor *input, - const std::vector &output_list, - StatsFuture *future); + private: std::unique_ptr kernel_; }; #endif // MACE_ENABLE_OPENCL + +void RegisterSplit(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Split", SplitOp, + DeviceType::CPU, float); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "Split", SplitOp, + DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "Split", SplitOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_SPLIT_H_ diff --git a/mace/kernels/sqrdiff_mean.h b/mace/kernels/sqrdiff_mean.cc similarity index 58% rename from mace/kernels/sqrdiff_mean.h rename to mace/kernels/sqrdiff_mean.cc index 1c2d009c..e9c7bde0 100644 --- a/mace/kernels/sqrdiff_mean.h +++ b/mace/kernels/sqrdiff_mean.cc @@ -12,28 +12,45 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_KERNELS_SQRDIFF_MEAN_H_ -#define MACE_KERNELS_SQRDIFF_MEAN_H_ - -#include #include #include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" +#include "mace/core/operator.h" #ifdef MACE_ENABLE_OPENCL -#include "mace/core/runtime/opencl/cl2_header.h" -#endif +#include "mace/kernels/opencl/image/sqrdiff_mean.h" +#endif // MACE_ENABLE_OPENCL namespace mace { namespace kernels { template -struct SqrDiffMeanFunctor : OpKernel { - explicit SqrDiffMeanFunctor(OpKernelContext *context) - : OpKernel(context) {} +class SqrDiffMeanOp : public Operation { + public: + explicit SqrDiffMeanOp(OpConstructContext *context) + : Operation(context) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input0 = this->Input(0); + const Tensor *input1 = this->Input(1); + Tensor *output = this->Output(0); + + MACE_CHECK(input0->dim(0) == input1->dim(0) && + input0->dim(1) == input1->dim(1), + "inputs dims N and C should be the same."); + std::vector out_shape(4); + out_shape[0] = input0->dim(0); + out_shape[1] = input0->dim(1); + out_shape[2] = 1; + out_shape[3] = 1; + + output->Resize(out_shape); + Compute(input0, input1, output); + return MaceStatus::MACE_SUCCESS; + } + + private: void Compute(const Tensor *input0, const Tensor *input1, Tensor *output) { @@ -56,54 +73,46 @@ struct SqrDiffMeanFunctor : OpKernel { output_ptr[i] /= img_size; } } - - MaceStatus operator()(const Tensor *input0, - const Tensor *input1, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); - - MACE_CHECK(input0->dim(0) == input1->dim(0) && - input0->dim(1) == input1->dim(1), - "inputs dims N and C should be the same."); - - std::vector out_shape(4); - out_shape[0] = input0->dim(0); - out_shape[1] = input0->dim(1); - out_shape[2] = 1; - out_shape[3] = 1; - - output->Resize(out_shape); - Compute(input0, input1, output); - return MACE_SUCCESS; - } }; + #ifdef MACE_ENABLE_OPENCL -class 
OpenCLSqrDiffMeanKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input0, - const Tensor *input1, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLSqrDiffMeanKernel); -}; template -struct SqrDiffMeanFunctor : OpKernel { - explicit SqrDiffMeanFunctor(OpKernelContext *context); - - MaceStatus operator()(const Tensor *input0, - const Tensor *input1, - Tensor *output, - StatsFuture *future); +class SqrDiffMeanOp : public Operation { + public: + explicit SqrDiffMeanOp(OpConstructContext *context) + : Operation(context) { + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::SqrDiffMeanKernel()); + } else { + MACE_NOT_IMPLEMENTED; + } + } + MaceStatus Run(OpContext *context) override { + const Tensor *input0 = this->Input(0); + const Tensor *input1 = this->Input(1); + Tensor *output = this->Output(0); + return kernel_->Compute(context, input0, input1, output); + } + private: std::unique_ptr kernel_; }; -#endif +#endif // MACE_ENABLE_OPENCL + + +void RegisterSqrDiffMean(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp, + DeviceType::CPU, float); + +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp, + DeviceType::GPU, float); + + MACE_REGISTER_OP(op_registry, "SqrDiffMean", SqrDiffMeanOp, + DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_SQRDIFF_MEAN_H_ diff --git a/mace/ops/squeeze.h b/mace/kernels/squeeze.cc similarity index 57% rename from mace/ops/squeeze.h rename to mace/kernels/squeeze.cc index 7febfb0e..8221bccb 100644 --- a/mace/ops/squeeze.h +++ b/mace/kernels/squeeze.cc @@ -12,27 +12,25 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_OPS_SQUEEZE_H_ -#define MACE_OPS_SQUEEZE_H_ - -#include #include +#include #include "mace/core/operator.h" namespace mace { -namespace ops { +namespace kernels { -template -class SqueezeOp : public Operator { +template +class SqueezeOp : public Operation { public: - SqueezeOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), - axis_(OperatorBase::GetRepeatedArgs("axis", {})) {} + explicit SqueezeOp(OpConstructContext *context) + : Operation(context), + axis_(Operation::GetRepeatedArgs("axis", {})) {} - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - Tensor *output = this->Output(OUTPUT); + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); std::vector output_shape; std::unordered_set axis_set(axis_.begin(), axis_.end()); @@ -45,19 +43,21 @@ class SqueezeOp : public Operator { output->ReuseTensorBuffer(*input); output->Reshape(output_shape); - SetFutureDefaultWaitFn(future); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } private: std::vector axis_; - - private: - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); }; -} // namespace ops -} // namespace mace +void RegisterSqueeze(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::CPU, float); + MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::CPU, uint8_t); +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::GPU, float); + MACE_REGISTER_OP(op_registry, "Squeeze", SqueezeOp, DeviceType::GPU, half); +#endif // MACE_ENABLE_OPENCL +} -#endif // MACE_OPS_SQUEEZE_H_ +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/stack.h b/mace/kernels/stack.cc similarity index 68% rename from mace/kernels/stack.h rename to mace/kernels/stack.cc index 4d465784..b3fc8bea 100644 --- a/mace/kernels/stack.h +++ 
b/mace/kernels/stack.cc @@ -12,34 +12,29 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_STACK_H_ -#define MACE_KERNELS_STACK_H_ - #include -#include -#include #include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" -#include "mace/public/mace.h" +#include "mace/core/operator.h" namespace mace { namespace kernels { template -struct StackFunctor : OpKernel { - StackFunctor(OpKernelContext *context, int axis) - : OpKernel(context), axis_(axis) {} +class StackOp : public Operation { + public: + explicit StackOp(OpConstructContext *context) + : Operation(context), + axis_(Operation::GetOptionalArg("axis", 0)) {} - MaceStatus operator()(const std::vector &inputs, - Tensor *output, - StatsFuture *future) { + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const std::vector &inputs = this->Inputs(); + Tensor *output = this->Output(0); MACE_CHECK(!inputs.empty(), "stack inputs are empty."); std::vector input_shape = inputs[0]->shape(); MACE_CHECK(axis_ >= -(inputs[0]->dim_size() + 1) && - axis_ < inputs[0]->dim_size() + 1, + axis_ < inputs[0]->dim_size() + 1, "axis out of bound."); if (axis_ < 0) { axis_ += inputs[0]->dim_size() + 1; @@ -48,14 +43,14 @@ struct StackFunctor : OpKernel { output_shape.insert(output_shape.begin() + axis_, inputs.size()); MACE_RETURN_IF_ERROR(output->Resize(output_shape)); - // Some inputs may be in gpu memory, so add mapping here. + // Some inputs_ may be in gpu memory, so add mapping here. 
std::vector mappers; for (size_t i = 0; i < inputs.size(); ++i) { mappers.emplace_back(Tensor::MappingGuard(inputs[i])); } // Output is on host, no need to map data - T *output_data = output->mutable_data(); + auto *output_data = output->mutable_data(); std::vector input_data(inputs.size()); for (size_t i = 0; i < inputs.size(); ++i) { input_data[i] = inputs[i]->data(); @@ -74,15 +69,21 @@ struct StackFunctor : OpKernel { output_data += low_dim_elem_size; } } - - SetFutureDefaultWaitFn(future); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } + private: int axis_; }; +void RegisterStack(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Stack", StackOp, DeviceType::CPU, float); + MACE_REGISTER_OP(op_registry, "Stack", StackOp, DeviceType::CPU, int32_t); +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "Stack", StackOp, DeviceType::GPU, float); + MACE_REGISTER_OP(op_registry, "Stack", StackOp, DeviceType::GPU, int32_t); +#endif // MACE_ENABLE_OPENCL +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_STACK_H_ diff --git a/mace/kernels/strided_slice.h b/mace/kernels/strided_slice.cc similarity index 77% rename from mace/kernels/strided_slice.h rename to mace/kernels/strided_slice.cc index 3f1f2c49..b030661b 100644 --- a/mace/kernels/strided_slice.h +++ b/mace/kernels/strided_slice.cc @@ -12,49 +12,40 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_KERNELS_STRIDED_SLICE_H_ -#define MACE_KERNELS_STRIDED_SLICE_H_ - #include -#include -#include #include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" -#include "mace/public/mace.h" +#include "mace/core/operator.h" namespace mace { namespace kernels { template -struct StridedSliceFunctor : OpKernel { - StridedSliceFunctor(OpKernelContext *context, - int begin_mask, - int end_mask, - int ellipsis_mask, - int new_axis_mask, - int shrink_axis_mask, - bool is_slice) - : OpKernel(context), - begin_mask_(begin_mask), - end_mask_(end_mask), - ellipsis_mask_(ellipsis_mask), - new_axis_mask_(new_axis_mask), - shrink_axis_mask_(shrink_axis_mask), - is_slice_(is_slice), - tmp_strides_tensor_(context->device()->allocator(), - DataTypeToEnum::v()) {} - - MaceStatus operator()(const Tensor *input, - const Tensor *begin_indices, - const Tensor *end_indices, - const Tensor *strides, - Tensor *output, - StatsFuture *future) { +class StridedSliceOp : public Operation { + public: + explicit StridedSliceOp(OpConstructContext *context) + : Operation(context), + begin_mask_(Operation::GetOptionalArg("begin_mask", 0)), + end_mask_(Operation::GetOptionalArg("end_mask", 0)), + ellipsis_mask_(Operation::GetOptionalArg("ellipsis_mask", 0)), + new_axis_mask_(Operation::GetOptionalArg("new_axis_mask", 0)), + shrink_axis_mask_( + Operation::GetOptionalArg("shrink_axis_mask", 0)), + is_slice_(Operation::GetOptionalArg("slice", false)) { MACE_CHECK(ellipsis_mask_ == 0 && new_axis_mask_ == 0, "ellipsis_mask and new_axis_mask are not supported yet."); + } + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(INPUT); + const Tensor *begin_indices = this->Input(BEGIN); + const Tensor *end_indices = this->Input(END); + const Tensor *strides = nullptr; + if (this->InputSize() > 3) { + strides = this->Input(STRIDES); + } + Tensor *output = this->Output(OUTPUT); if (strides == nullptr) { 
tmp_strides_tensor_.Resize({begin_indices->size()}); Tensor::MappingGuard strides_guard(&tmp_strides_tensor_); @@ -118,16 +109,16 @@ struct StridedSliceFunctor : OpKernel { } else { real_end_indices[d] = end_indices_data[d] < -dim_len - ? -1 - : (end_indices_data[d] < 0 - ? (end_indices_data[d] + dim_len) - : std::min(static_cast(end_indices_data[d]), - dim_len)); + ? -1 + : (end_indices_data[d] < 0 + ? (end_indices_data[d] + dim_len) + : std::min(static_cast(end_indices_data[d]), + dim_len)); } int32_t out_dim_len = std::max( 0.f, std::ceil((real_end_indices[d] - real_begin_indices[d]) / - static_cast(strides_data[d]))); + static_cast(strides_data[d]))); if (!(shrink_axis_mask_ & (1 << d))) { output_shape.push_back(out_dim_len); } else { @@ -197,7 +188,7 @@ struct StridedSliceFunctor : OpKernel { : k > real_end_indices[2]; k += strides_data[2]) { *output_data++ = - input_data[(i * input->dim(1) + j) * input->dim(2) + k]; + input_data[(i * input->dim(1) + j) * input->dim(2) + k]; } } } @@ -205,11 +196,10 @@ struct StridedSliceFunctor : OpKernel { MACE_NOT_IMPLEMENTED; } } - - SetFutureDefaultWaitFn(future); - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } + private: int begin_mask_; int end_mask_; int ellipsis_mask_; @@ -217,9 +207,23 @@ struct StridedSliceFunctor : OpKernel { int shrink_axis_mask_; bool is_slice_; Tensor tmp_strides_tensor_; + + MACE_OP_INPUT_TAGS(INPUT, BEGIN, END, STRIDES); + MACE_OP_OUTPUT_TAGS(OUTPUT); }; +void RegisterStridedSlice(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "StridedSlice", StridedSliceOp, + DeviceType::CPU, float); + MACE_REGISTER_OP(op_registry, "StridedSlice", StridedSliceOp, + DeviceType::CPU, int32_t); +#ifdef MACE_ENABLE_OPENCL + MACE_REGISTER_OP(op_registry, "StridedSlice", StridedSliceOp, + DeviceType::GPU, float); + MACE_REGISTER_OP(op_registry, "StridedSlice", StridedSliceOp, + DeviceType::GPU, int32_t); +#endif // MACE_ENABLE_OPENCL +} + } // namespace kernels } // namespace mace - 
-#endif // MACE_KERNELS_STRIDED_SLICE_H_ diff --git a/mace/kernels/transpose.h b/mace/kernels/transpose.cc similarity index 88% rename from mace/kernels/transpose.h rename to mace/kernels/transpose.cc index 04e1caed..2ec38015 100644 --- a/mace/kernels/transpose.h +++ b/mace/kernels/transpose.cc @@ -12,9 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_TRANSPOSE_H_ -#define MACE_KERNELS_TRANSPOSE_H_ - #if defined(MACE_ENABLE_NEON) #include #endif @@ -22,10 +19,7 @@ #include #include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/public/mace.h" -#include "mace/utils/utils.h" +#include "mace/core/operator.h" namespace mace { namespace kernels { @@ -104,19 +98,29 @@ static void TransposeNCHWToNHWCC2(const float *input, } } -template -struct TransposeFunctor : OpKernel { - TransposeFunctor(OpKernelContext *context, const std::vector &dims) - : OpKernel(context), dims_(dims) {} +template +class TransposeOp : public Operation { + public: + explicit TransposeOp(OpConstructContext *context) + : Operation(context), + dims_(Operation::GetRepeatedArgs("dims")) {} + + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + Tensor *output = this->Output(0); + const std::vector &input_shape = input->shape(); + MACE_CHECK((input_shape.size() == 4 && dims_.size() == 4) || + (input_shape.size() == 2 && dims_.size() == 2), + "rank should be 2 or 4"); + std::vector output_shape; + for (size_t i = 0; i < dims_.size(); ++i) { + output_shape.push_back(input_shape[dims_[i]]); + } + MACE_RETURN_IF_ERROR(output->Resize(output_shape)); - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future) { - MACE_UNUSED(future); Tensor::MappingGuard input_guard(input); Tensor::MappingGuard output_guard(output); - const std::vector &input_shape = input->shape(); - const std::vector &output_shape = 
output->shape(); const T *input_data = input->data(); T *output_data = output->mutable_data(); @@ -216,13 +220,17 @@ struct TransposeFunctor : OpKernel { MACE_NOT_IMPLEMENTED; } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } + private: std::vector dims_; }; +void RegisterTranspose(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Transpose", TransposeOp, + DeviceType::CPU, float); +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_TRANSPOSE_H_ diff --git a/mace/kernels/unstack.h b/mace/kernels/unstack.cc similarity index 76% rename from mace/kernels/unstack.h rename to mace/kernels/unstack.cc index b193c6b5..8403b8f6 100644 --- a/mace/kernels/unstack.h +++ b/mace/kernels/unstack.cc @@ -12,30 +12,25 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifndef MACE_KERNELS_UNSTACK_H_ -#define MACE_KERNELS_UNSTACK_H_ - #include -#include -#include #include -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/kernel.h" -#include "mace/public/mace.h" +#include "mace/core/operator.h" namespace mace { namespace kernels { template -struct UnstackFunctor : OpKernel { - UnstackFunctor(OpKernelContext *context, int axis) - : OpKernel(context), axis_(axis) {} +class UnstackOp : public Operation { + public: + explicit UnstackOp(OpConstructContext *context) + : Operation(context), + axis_(Operation::GetOptionalArg("axis", 0)) {} - MaceStatus operator()(const Tensor *input, - const std::vector &outputs, - StatsFuture *future) { + MaceStatus Run(OpContext *context) override { + MACE_UNUSED(context); + const Tensor *input = this->Input(0); + const std::vector outputs = this->Outputs(); std::vector input_shape = input->shape(); MACE_CHECK(axis_ >= -(input->dim_size()) && axis_ < input->dim_size(), "axis out of bound."); @@ -71,15 +66,19 @@ struct UnstackFunctor : OpKernel { input_idx += low_dim_elem_size; } } - - SetFutureDefaultWaitFn(future); - 
return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } + private: int axis_; }; +void RegisterUnstack(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "Unstack", UnstackOp, + DeviceType::CPU, float); + MACE_REGISTER_OP(op_registry, "Unstack", UnstackOp, + DeviceType::CPU, int32_t); +} + } // namespace kernels } // namespace mace - -#endif // MACE_KERNELS_UNSTACK_H_ diff --git a/mace/kernels/winograd_transform.cc b/mace/kernels/winograd_transform.cc new file mode 100644 index 00000000..286bff95 --- /dev/null +++ b/mace/kernels/winograd_transform.cc @@ -0,0 +1,102 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include + +#include "mace/core/operator.h" +#include "mace/kernels/activation.h" +#include "mace/kernels/conv_pool_2d_util.h" +#include "mace/kernels/opencl/image/winograd_transform.h" + +namespace mace { +namespace kernels { + +template +class WinogradTransformOp; + +template +class WinogradTransformOp : public Operation { + public: + explicit WinogradTransformOp(OpConstructContext *context) + : Operation(context) { + Padding padding_type = static_cast(Operation::GetOptionalArg( + "padding", static_cast(VALID))); + std::vector paddings = Operation::GetRepeatedArgs( + "padding_values"); + int block_size = Operation::GetOptionalArg("wino_block_size", 2); + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::WinogradTransformKernel( + padding_type, paddings, block_size)); + } else { + MACE_NOT_IMPLEMENTED; + } + } + + MaceStatus Run(OpContext *context) override { + const Tensor *input_tensor = this->Input(0); + Tensor *output_tensor = this->Output(0); + return kernel_->Compute(context, input_tensor, output_tensor); + } + + private: + std::unique_ptr kernel_; +}; + +template +class WinogradInverseTransformOp; + +template +class WinogradInverseTransformOp : public Operation { + public: + explicit WinogradInverseTransformOp(OpConstructContext *context) + : Operation(context) { + ActivationType activation = kernels::StringToActivationType( + Operation::GetOptionalArg("activation", "NOOP")); + float relux_max_limit = Operation::GetOptionalArg("max_limit", 0.0f); + int block_size = Operation::GetOptionalArg("wino_block_size", 2); + if (context->device()->opencl_runtime()->UseImageMemory()) { + kernel_.reset(new opencl::image::WinogradInverseTransformKernel( + activation, relux_max_limit, block_size)); + } else { + MACE_NOT_IMPLEMENTED; + } + } + + MaceStatus Run(OpContext *context) override { + Tensor *output_tensor = this->Output(0); + return kernel_->Compute(context, inputs_, output_tensor); + } + + private: + 
std::unique_ptr kernel_; +}; + +void RegisterWinogradTransform(OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "WinogradTransform", + WinogradTransformOp, DeviceType::GPU, float); + MACE_REGISTER_OP(op_registry, "WinogradTransform", + WinogradTransformOp, DeviceType::GPU, half); +} + +void RegisterWinogradInverseTransform( + OpRegistryBase *op_registry) { + MACE_REGISTER_OP(op_registry, "WinogradInverseTransform", + WinogradInverseTransformOp, DeviceType::GPU, float); + MACE_REGISTER_OP(op_registry, "WinogradInverseTransform", + WinogradInverseTransformOp, DeviceType::GPU, half); +} + +} // namespace kernels +} // namespace mace diff --git a/mace/kernels/winograd_transform.h b/mace/kernels/winograd_transform.h deleted file mode 100644 index 31364598..00000000 --- a/mace/kernels/winograd_transform.h +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_KERNELS_WINOGRAD_TRANSFORM_H_ -#define MACE_KERNELS_WINOGRAD_TRANSFORM_H_ - -#include -#include - -#include "mace/core/future.h" -#include "mace/core/tensor.h" -#include "mace/kernels/activation.h" -#include "mace/kernels/conv_pool_2d_util.h" - -namespace mace { -namespace kernels { - -template -struct WinogradTransformFunctor; - -#ifdef MACE_ENABLE_OPENCL -class OpenCLWinogradTransformKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const Tensor *input, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLWinogradTransformKernel); -}; -template -struct WinogradTransformFunctor : OpKernel { - WinogradTransformFunctor(OpKernelContext *context, - const Padding &padding_type, - const std::vector &paddings, - const int block_size); - - MaceStatus operator()(const Tensor *input, - Tensor *output, - StatsFuture *future); - - std::unique_ptr kernel_; -}; -#endif // MACE_ENABLE_OPENCL - - -template -struct WinogradInverseTransformFunctor; - -#ifdef MACE_ENABLE_OPENCL -class OpenCLWinogradInverseTransformKernel { - public: - virtual MaceStatus Compute( - OpKernelContext *context, - const std::vector &inputs, - Tensor *output, - StatsFuture *future) = 0; - MACE_VIRTUAL_EMPTY_DESTRUCTOR(OpenCLWinogradInverseTransformKernel); -}; -template -struct WinogradInverseTransformFunctor : OpKernel { - WinogradInverseTransformFunctor(OpKernelContext *context, - const ActivationType activation, - const float relux_max_limit, - const int block_size); - - MaceStatus operator()(const std::vector &inputs, - Tensor *output, - StatsFuture *future); - - std::unique_ptr kernel_; -}; -#endif // MACE_ENABLE_OPENCL - -} // namespace kernels -} // namespace mace - -#endif // MACE_KERNELS_WINOGRAD_TRANSFORM_H_ diff --git a/mace/libmace/BUILD b/mace/libmace/BUILD index ccde07e1..bc44c109 100644 --- a/mace/libmace/BUILD +++ b/mace/libmace/BUILD @@ -40,6 +40,7 @@ cc_library( deps = [ "//mace/public", "//mace/ops", + 
"//mace/kernels", ], alwayslink = 1, ) @@ -49,18 +50,24 @@ cc_binary( linkopts = [ "-Wl,-soname,libmace.so", "-Wl,--version-script", - "mace_version_script.lds", + "$(location //mace/libmace:mace_version_script.lds)", ] + if_openmp_enabled([ "-fopenmp", ]), linkshared = 1, linkstatic = 1, deps = [ - ":mace_version_script.lds", + "//mace/libmace:mace_version_script.lds", "//mace/libmace", ], ) +exports_files( + [ + "mace_version_script.lds", + ], +) + cc_library( name = "libmace_dynamic", srcs = ["libmace.so"], diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc index 6e1b44d8..52584abb 100644 --- a/mace/libmace/mace.cc +++ b/mace/libmace/mace.cc @@ -22,7 +22,8 @@ #include "mace/core/net.h" #include "mace/core/device_context.h" -#include "mace/ops/ops_register.h" +#include "mace/kernels/ops_register.h" +#include "mace/ops/ops_def_register.h" #include "mace/public/mace.h" #ifdef MACE_ENABLE_OPENCL @@ -237,7 +238,7 @@ MaceEngineConfig::Impl::Impl(const DeviceType device_type) MaceStatus MaceEngineConfig::Impl::SetGPUContext( std::shared_ptr context) { gpu_context_ = context; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } MaceStatus MaceEngineConfig::Impl::SetGPUHints( @@ -245,7 +246,7 @@ MaceStatus MaceEngineConfig::Impl::SetGPUHints( GPUPriorityHint priority_hint) { gpu_perf_hint_ = perf_hint; gpu_priority_hint_ = priority_hint; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } MaceStatus MaceEngineConfig::Impl::SetCPUThreadPolicy( @@ -255,7 +256,7 @@ MaceStatus MaceEngineConfig::Impl::SetCPUThreadPolicy( num_threads_ = num_threads; cpu_affinity_policy_ = policy; use_gemmlowp_ = use_gemmlowp; - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } @@ -358,7 +359,8 @@ class MaceEngine::Impl { private: const unsigned char *model_data_; size_t model_data_size_; - std::shared_ptr op_registry_; + std::unique_ptr op_def_registry_; + std::unique_ptr op_registry_; DeviceType device_type_; std::unique_ptr device_; std::unique_ptr ws_; @@ -375,7 
+377,8 @@ class MaceEngine::Impl { MaceEngine::Impl::Impl(const MaceEngineConfig &config) : model_data_(nullptr), model_data_size_(0), - op_registry_(new OperatorRegistry()), + op_def_registry_(new OpDefRegistry()), + op_registry_(new OpRegistry), device_type_(config.impl_->device_type()), device_(nullptr), ws_(new Workspace()), @@ -462,10 +465,21 @@ MaceStatus MaceEngine::Impl::Init( model_data)); // Init model - auto net = CreateNet(op_registry_, *net_def, ws_.get(), device_.get(), - NetMode::INIT); + auto net = std::unique_ptr(new SerialNet( + op_def_registry_.get(), + op_registry_.get(), + net_def, + ws_.get(), + device_.get(), + NetMode::INIT)); + MACE_RETURN_IF_ERROR(net->Init()); MACE_RETURN_IF_ERROR(net->Run()); - net_ = CreateNet(op_registry_, *net_def, ws_.get(), device_.get()); + net_ = std::unique_ptr(new SerialNet(op_def_registry_.get(), + op_registry_.get(), + net_def, + ws_.get(), + device_.get())); + MACE_RETURN_IF_ERROR(net_->Init()); #ifdef MACE_ENABLE_HEXAGON } #endif @@ -563,6 +577,7 @@ MaceStatus MaceEngine::Impl::Run( #ifdef MACE_ENABLE_OPENCL if (device_type_ == GPU) { + device_->opencl_runtime()->command_queue().finish(); device_->opencl_runtime()->SaveBuiltCLProgram(); } #endif @@ -582,10 +597,10 @@ MaceStatus MaceEngine::Impl::Run( std::memcpy(output.second.data().get(), output_tensor->data(), output_size * sizeof(float)); } else { - return MACE_INVALID_ARGS; + return MaceStatus::MACE_INVALID_ARGS; } } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } MaceEngine::MaceEngine(const MaceEngineConfig &config): diff --git a/mace/ops/BUILD b/mace/ops/BUILD index 54a885ab..d039f8c8 100644 --- a/mace/ops/BUILD +++ b/mace/ops/BUILD @@ -41,34 +41,20 @@ cc_library( "-DMACE_ENABLE_HEXAGON", ]), deps = [ - "//mace/ops", + "ops", + "//mace/kernels", "@gtest", ], ) cc_library( name = "ops", - srcs = glob( - ["*.cc"], - exclude = [ - "*_test.cc", - "*_benchmark.cc", - "ops_test_util.cc", - "buffer_transform.cc", - "buffer_inverse_transform.cc", 
- "lstmcell.cc", - ], - ) + if_opencl_enabled( - [ - "buffer_transform.cc", - "buffer_inverse_transform.cc", - "lstmcell.cc", - ], - ), - hdrs = glob( - ["*.h"], - exclude = glob(["*_test_util.h"]), - ), + srcs = [ + "ops_def_register.cc", + ], + hdrs = [ + "ops_def_register.h", + ], copts = [ "-Werror", "-Wextra", @@ -84,7 +70,7 @@ cc_library( "-DMACE_ENABLE_HEXAGON", ]), deps = [ - "//mace/kernels", + "//mace/core", ], ) diff --git a/mace/ops/activation.cc b/mace/ops/activation.cc deleted file mode 100644 index 44b2ba90..00000000 --- a/mace/ops/activation.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/activation.h" - -namespace mace { -namespace ops { - -void Register_Activation(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ActivationOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ActivationOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Activation") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ActivationOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/activation.h b/mace/ops/activation.h deleted file mode 100644 index 3b48891e..00000000 --- a/mace/ops/activation.h +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_ACTIVATION_H_ -#define MACE_OPS_ACTIVATION_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/activation.h" - -namespace mace { -namespace ops { - -template -class ActivationOp : public Operator { - public: - ActivationOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context, - kernels::StringToActivationType( - OperatorBase::GetOptionalArg("activation", - "NOOP")), - static_cast( - OperatorBase::GetOptionalArg("max_limit", 0.0f))) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input_tensor = this->Input(0); - const Tensor *alpha_tensor = - this->InputSize() >= 2 ? this->Input(1) : nullptr; - Tensor *output_tensor = this->Output(0); - MACE_RETURN_IF_ERROR(output_tensor->ResizeLike(input_tensor)); - - return functor_(input_tensor, alpha_tensor, output_tensor, future); - } - - private: - kernels::ActivationFunctor functor_; -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_ACTIVATION_H_ diff --git a/mace/ops/activation_benchmark.cc b/mace/ops/activation_benchmark.cc index 341b5f71..1f16879f 100644 --- a/mace/ops/activation_benchmark.cc +++ b/mace/ops/activation_benchmark.cc @@ -14,7 +14,7 @@ #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/activation_test.cc b/mace/ops/activation_test.cc index 49422f3a..01735e97 100644 --- a/mace/ops/activation_test.cc +++ b/mace/ops/activation_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/addn.cc b/mace/ops/addn.cc deleted file mode 100644 index a30cba48..00000000 --- a/mace/ops/addn.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/addn.h" - -namespace mace { -namespace ops { - -void Register_AddN(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - AddNOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - AddNOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("AddN") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - AddNOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/addn.h b/mace/ops/addn.h deleted file mode 100644 index 4238a013..00000000 --- a/mace/ops/addn.h +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_ADDN_H_ -#define MACE_OPS_ADDN_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/addn.h" - -namespace mace { -namespace ops { - -template -class AddNOp : public Operator { - public: - AddNOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), functor_(context) {} - - MaceStatus Run(StatsFuture *future) override { - Tensor *output_tensor = this->Output(0); - int n = this->inputs_.size(); - std::vector inputs(n, nullptr); - inputs[0] = this->Input(0); - for (int i = 1; i < n; ++i) { - inputs[i] = this->Input(i); - MACE_CHECK(inputs[0]->dim_size() == inputs[i]->dim_size()); - MACE_CHECK(inputs[0]->size() == inputs[i]->size()) - << "Input 0: " << MakeString(inputs[0]->shape()) - << ", size: " << inputs[0]->size() << ". 
Input " << i << ": " - << MakeString(inputs[i]->shape()) << ", size: " << inputs[i]->size(); - } - return functor_(inputs, output_tensor, future); - } - - private: - kernels::AddNFunctor functor_; -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_ADDN_H_ diff --git a/mace/ops/addn_benchmark.cc b/mace/ops/addn_benchmark.cc index 1b72c791..a155d854 100644 --- a/mace/ops/addn_benchmark.cc +++ b/mace/ops/addn_benchmark.cc @@ -14,7 +14,7 @@ #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/addn_test.cc b/mace/ops/addn_test.cc index 7154ad52..5116e36b 100644 --- a/mace/ops/addn_test.cc +++ b/mace/ops/addn_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/argmax.cc b/mace/ops/argmax.cc deleted file mode 100644 index e14b7bb8..00000000 --- a/mace/ops/argmax.cc +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/argmax.h" - -namespace mace { -namespace ops { - -void Register_ArgMax(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ArgMax") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ArgMaxOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/argmax.h b/mace/ops/argmax.h deleted file mode 100644 index b1d7ec4e..00000000 --- a/mace/ops/argmax.h +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_ARGMAX_H_ -#define MACE_OPS_ARGMAX_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/argmax.h" - -namespace mace { -namespace ops { - -template -class ArgMaxOp : public Operator { - public: - ArgMaxOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), functor_(context) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(0); - const Tensor *axis = this->Input(1); - Tensor *output = this->Output(0); - return functor_(input, axis, output, future); - } - - private: - kernels::ArgMaxFunctor functor_; - - MACE_OP_INPUT_TAGS(INPUT, AXIS); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_ARGMAX_H_ diff --git a/mace/ops/argmax_test.cc b/mace/ops/argmax_test.cc index ca7ece35..06de7046 100644 --- a/mace/ops/argmax_test.cc +++ b/mace/ops/argmax_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/batch_norm.cc b/mace/ops/batch_norm.cc deleted file mode 100644 index c1a6c0cf..00000000 --- a/mace/ops/batch_norm.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/batch_norm.h" - -namespace mace { -namespace ops { - -void Register_BatchNorm(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - BatchNormOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BatchNormOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchNorm") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BatchNormOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/batch_norm.h b/mace/ops/batch_norm.h deleted file mode 100644 index 7221c3ca..00000000 --- a/mace/ops/batch_norm.h +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_BATCH_NORM_H_ -#define MACE_OPS_BATCH_NORM_H_ - -#include "mace/core/operator.h" -#include "mace/kernels/activation.h" -#include "mace/kernels/batch_norm.h" - -namespace mace { -namespace ops { - -template -class BatchNormOp : public Operator { - public: - BatchNormOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context, false, kernels::ActivationType::NOOP, 0.0f) { - epsilon_ = OperatorBase::GetOptionalArg("epsilon", - static_cast(1e-4)); - } - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - const Tensor *scale = this->Input(SCALE); - const Tensor *offset = this->Input(OFFSET); - const Tensor *mean = this->Input(MEAN); - const Tensor *var = this->Input(VAR); - - MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional. ", - input->dim_size()); - MACE_CHECK(scale->dim_size() == 1, "scale must be 1-dimensional. ", - scale->dim_size()); - MACE_CHECK(offset->dim_size() == 1, "offset must be 1-dimensional. ", - offset->dim_size()); - MACE_CHECK(mean->dim_size() == 1, "mean must be 1-dimensional. ", - mean->dim_size()); - MACE_CHECK(var->dim_size() == 1, "var must be 1-dimensional. ", - var->dim_size()); - - Tensor *output = this->Output(OUTPUT); - MACE_RETURN_IF_ERROR(output->ResizeLike(input)); - return functor_(input, scale, offset, - mean, var, epsilon_, output, future); - } - - private: - float epsilon_; - kernels::BatchNormFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_BATCH_NORM_H_ diff --git a/mace/ops/batch_norm_benchmark.cc b/mace/ops/batch_norm_benchmark.cc index 648ddfca..c390860e 100644 --- a/mace/ops/batch_norm_benchmark.cc +++ b/mace/ops/batch_norm_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/batch_norm_test.cc b/mace/ops/batch_norm_test.cc index 7d5b77da..3c22d5ff 100644 --- a/mace/ops/batch_norm_test.cc +++ b/mace/ops/batch_norm_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/batch_to_space.cc b/mace/ops/batch_to_space.cc deleted file mode 100644 index 103e1297..00000000 --- a/mace/ops/batch_to_space.cc +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/batch_to_space.h" - -namespace mace { -namespace ops { - -void Register_BatchToSpaceND(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - BatchToSpaceNDOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - BatchToSpaceNDOp); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BatchToSpaceNDOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BatchToSpaceND") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BatchToSpaceNDOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/batch_to_space.h b/mace/ops/batch_to_space.h deleted file mode 100644 index 458db284..00000000 --- a/mace/ops/batch_to_space.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_BATCH_TO_SPACE_H_ -#define MACE_OPS_BATCH_TO_SPACE_H_ - -#include -#include - -#include "mace/core/operator.h" -#include "mace/kernels/batch_to_space.h" - -namespace mace { -namespace ops { - -template -class BatchToSpaceNDOp : public Operator { - public: - BatchToSpaceNDOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), - functor_(context, - OperatorBase::GetRepeatedArgs("crops", {0, 0, 0, 0}), - OperatorBase::GetRepeatedArgs("block_shape", {1, 1})) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *batch_tensor = this->Input(INPUT); - Tensor *space_tensor = this->Output(OUTPUT); - return functor_(batch_tensor, space_tensor, future); - } - - private: - kernels::BatchToSpaceFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_BATCH_TO_SPACE_H_ diff --git a/mace/ops/batch_to_space_benchmark.cc b/mace/ops/batch_to_space_benchmark.cc index c6b3e25a..4cf55b33 100644 --- a/mace/ops/batch_to_space_benchmark.cc +++ b/mace/ops/batch_to_space_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/bias_add.cc b/mace/ops/bias_add.cc deleted file mode 100644 index bf082cf9..00000000 --- a/mace/ops/bias_add.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/bias_add.h" - -namespace mace { -namespace ops { - -void Register_BiasAdd(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - BiasAddOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BiasAddOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BiasAdd") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BiasAddOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/bias_add.h b/mace/ops/bias_add.h deleted file mode 100644 index ee3de991..00000000 --- a/mace/ops/bias_add.h +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_BIAS_ADD_H_ -#define MACE_OPS_BIAS_ADD_H_ - -#include "mace/core/operator.h" -#include "mace/kernels/bias_add.h" - -namespace mace { -namespace ops { - -template -class BiasAddOp : public Operator { - public: - BiasAddOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context, - static_cast(OperatorBase::GetOptionalArg( - "data_format", NHWC))) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - const Tensor *bias = this->Input(BIAS); - - MACE_CHECK(bias->dim_size() == 1, "bias must be 1-dimensional. ", - bias->dim_size()); - - Tensor *output = this->Output(OUTPUT); - MACE_RETURN_IF_ERROR(output->ResizeLike(input)); - - return functor_(input, bias, output, future); - } - - private: - kernels::BiasAddFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(INPUT, BIAS); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_BIAS_ADD_H_ diff --git a/mace/ops/bias_add_benchmark.cc b/mace/ops/bias_add_benchmark.cc index ca8500ed..5908caa2 100644 --- a/mace/ops/bias_add_benchmark.cc +++ b/mace/ops/bias_add_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/bias_add_test.cc b/mace/ops/bias_add_test.cc index 51c8cc88..771065c2 100644 --- a/mace/ops/bias_add_test.cc +++ b/mace/ops/bias_add_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/buffer_inverse_transform.cc b/mace/ops/buffer_inverse_transform.cc deleted file mode 100644 index af52d482..00000000 --- a/mace/ops/buffer_inverse_transform.cc +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/buffer_inverse_transform.h" - -namespace mace { -namespace ops { - -void Register_BufferInverseTransform(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferInverseTransform") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BufferInverseTransformOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferInverseTransform") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BufferInverseTransformOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/buffer_inverse_transform.h b/mace/ops/buffer_inverse_transform.h deleted file mode 100644 index 9eefb0f0..00000000 --- a/mace/ops/buffer_inverse_transform.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_BUFFER_INVERSE_TRANSFORM_H_ -#define MACE_OPS_BUFFER_INVERSE_TRANSFORM_H_ - -#include "mace/core/operator.h" -#include "mace/kernels/buffer_inverse_transform.h" - -namespace mace { -namespace ops { - -template -class BufferInverseTransformOp : public Operator { - public: - BufferInverseTransformOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), - functor_(context, - OperatorBase::GetOptionalArg("wino_block_size", 2)) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - Tensor *output = this->Output(OUTPUT); - - kernels::BufferType type = - static_cast(OperatorBase::GetOptionalArg( - "buffer_type", static_cast(kernels::CONV2D_FILTER))); - return functor_(input, type, output, future); - } - - private: - kernels::BufferInverseTransformFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_BUFFER_INVERSE_TRANSFORM_H_ diff --git a/mace/ops/buffer_to_image_benchmark.cc b/mace/ops/buffer_to_image_benchmark.cc index 7d94c525..fb1cf51c 100644 --- a/mace/ops/buffer_to_image_benchmark.cc +++ b/mace/ops/buffer_to_image_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/buffer_transform.cc b/mace/ops/buffer_transform.cc deleted file mode 100644 index bab1b894..00000000 --- a/mace/ops/buffer_transform.cc +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/buffer_transform.h" - -namespace mace { -namespace ops { - -void Register_BufferTransform(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferTransform") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BufferTransformOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("BufferTransform") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - BufferTransformOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/buffer_transform.h b/mace/ops/buffer_transform.h deleted file mode 100644 index 94a4779f..00000000 --- a/mace/ops/buffer_transform.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_BUFFER_TRANSFORM_H_ -#define MACE_OPS_BUFFER_TRANSFORM_H_ - -#include "mace/core/operator.h" -#include "mace/kernels/buffer_transform.h" - -namespace mace { -namespace ops { - -template -class BufferTransformOp : public Operator { - public: - BufferTransformOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), - functor_(context, - OperatorBase::GetOptionalArg("wino_block_size", 2)) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input_tensor = this->Input(INPUT); - - kernels::BufferType type = - static_cast(OperatorBase::GetOptionalArg( - "buffer_type", static_cast(kernels::CONV2D_FILTER))); - Tensor *output = this->Output(OUTPUT); - - return functor_(input_tensor, type, output, future); - } - - private: - kernels::BufferTransformFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace -#endif // MACE_OPS_BUFFER_TRANSFORM_H_ diff --git a/mace/ops/cast.cc b/mace/ops/cast.cc deleted file mode 100644 index 87abfdd4..00000000 --- a/mace/ops/cast.cc +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/cast.h" - -namespace mace { -namespace ops { - -void Register_Cast(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Cast") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - CastOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Cast") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - CastOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/cast_test.cc b/mace/ops/cast_test.cc index f35d3af6..a0064993 100644 --- a/mace/ops/cast_test.cc +++ b/mace/ops/cast_test.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "gmock/gmock.h" -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/channel_shuffle.cc b/mace/ops/channel_shuffle.cc deleted file mode 100644 index e13ac92a..00000000 --- a/mace/ops/channel_shuffle.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/channel_shuffle.h" - -namespace mace { -namespace ops { - -void Register_ChannelShuffle(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ChannelShuffleOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ChannelShuffleOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ChannelShuffle") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ChannelShuffleOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/channel_shuffle.h b/mace/ops/channel_shuffle.h deleted file mode 100644 index a459a0b3..00000000 --- a/mace/ops/channel_shuffle.h +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_CHANNEL_SHUFFLE_H_ -#define MACE_OPS_CHANNEL_SHUFFLE_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/channel_shuffle.h" - -namespace mace { -namespace ops { - -template -class ChannelShuffleOp : public Operator { - public: - ChannelShuffleOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - group_(OperatorBase::GetOptionalArg("group", 1)), - functor_(context, this->group_) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - Tensor *output = this->Output(OUTPUT); - int channels; - if (D == GPU) { - channels = input->dim(3); - } else if (D == CPU) { - channels = input->dim(1); - } else { - MACE_NOT_IMPLEMENTED; - } - MACE_CHECK(channels % group_ == 0, - "input channels must be an integral multiple of group. ", - input->dim(3)); - return functor_(input, output, future); - } - - protected: - const int group_; - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); - - private: - kernels::ChannelShuffleFunctor functor_; -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_CHANNEL_SHUFFLE_H_ diff --git a/mace/ops/channel_shuffle_benchmark.cc b/mace/ops/channel_shuffle_benchmark.cc index 49f494c8..d45216eb 100644 --- a/mace/ops/channel_shuffle_benchmark.cc +++ b/mace/ops/channel_shuffle_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/channel_shuffle_test.cc b/mace/ops/channel_shuffle_test.cc index 2102fe76..1ce0cea1 100644 --- a/mace/ops/channel_shuffle_test.cc +++ b/mace/ops/channel_shuffle_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/concat.cc b/mace/ops/concat.cc deleted file mode 100644 index 6a860a42..00000000 --- a/mace/ops/concat.cc +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/concat.h" - -namespace mace { -namespace ops { - -void Register_Concat(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ConcatOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ConcatOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ConcatOp); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ConcatOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ConcatOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/concat.h b/mace/ops/concat.h deleted file mode 100644 index 94dee3d3..00000000 --- a/mace/ops/concat.h +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2018 
Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_CONCAT_H_ -#define MACE_OPS_CONCAT_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/concat.h" - -namespace mace { -namespace ops { - -template -class ConcatOp : public Operator { - public: - ConcatOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), - functor_(context, OperatorBase::GetOptionalArg("axis", 3)) {} - - MaceStatus Run(StatsFuture *future) override { - MACE_CHECK(this->InputSize() >= 2) - << "There must be at least two inputs to concat"; - const std::vector input_list = this->Inputs(); - const int32_t concat_axis = OperatorBase::GetOptionalArg("axis", 3); - const int32_t input_dims = input_list[0]->dim_size(); - const int32_t axis = - concat_axis < 0 ? 
concat_axis + input_dims : concat_axis; - MACE_CHECK((0 <= axis && axis < input_dims), - "Expected concatenating axis in the range [", -input_dims, ", ", - input_dims, "], but got", concat_axis); - - Tensor *output = this->Output(OUTPUT); - - return functor_(input_list, output, future); - } - - private: - kernels::ConcatFunctor functor_; - - private: - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_CONCAT_H_ diff --git a/mace/ops/concat_benchmark.cc b/mace/ops/concat_benchmark.cc index 5375cb6d..486d9b6e 100644 --- a/mace/ops/concat_benchmark.cc +++ b/mace/ops/concat_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/concat_test.cc b/mace/ops/concat_test.cc index 671b8f61..431e7a2d 100644 --- a/mace/ops/concat_test.cc +++ b/mace/ops/concat_test.cc @@ -16,7 +16,6 @@ #include #include "gmock/gmock.h" -#include "mace/ops/concat.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc deleted file mode 100644 index 516520f9..00000000 --- a/mace/ops/conv_2d.cc +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/conv_2d.h" - -namespace mace { -namespace ops { - -void Register_Conv2D(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - Conv2dOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - Conv2dOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - Conv2dOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Conv2D") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - Conv2dOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/conv_2d.h b/mace/ops/conv_2d.h deleted file mode 100644 index 9f731fa4..00000000 --- a/mace/ops/conv_2d.h +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_CONV_2D_H_ -#define MACE_OPS_CONV_2D_H_ - -#include -#include - -#include "mace/core/operator.h" -#include "mace/kernels/conv_2d.h" -#include "mace/ops/conv_pool_2d_base.h" - -namespace mace { -namespace ops { - -template -class Conv2dOp : public ConvPool2dOpBase { - public: - Conv2dOp(const OperatorDef &op_def, OpKernelContext *context) - : ConvPool2dOpBase(op_def, context), - functor_(context, - this->strides_.data(), - this->padding_type_, - this->paddings_, - this->dilations_.data(), - kernels::StringToActivationType( - OperatorBase::GetOptionalArg("activation", - "NOOP")), - OperatorBase::GetOptionalArg("max_limit", 0.0f)) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - const Tensor *filter = this->Input(FILTER); - const Tensor *bias = this->InputSize() >= 3 ? this->Input(BIAS) : nullptr; - Tensor *output = this->Output(OUTPUT); - return functor_(input, filter, bias, output, future); - } - - private: - kernels::Conv2dFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_CONV_2D_H_ diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc index 313cd35b..76e3696d 100644 --- a/mace/ops/conv_2d_benchmark.cc +++ b/mace/ops/conv_2d_benchmark.cc @@ -14,9 +14,9 @@ #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" -#include "mace/ops/conv_2d.h" +#include "mace/kernels/conv_pool_2d_util.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc index 354b1935..28037011 100644 --- a/mace/ops/conv_2d_test.cc +++ b/mace/ops/conv_2d_test.cc @@ -15,8 +15,7 @@ #include #include -#include "mace/kernels/quantize.h" -#include "mace/ops/conv_2d.h" +#include "mace/kernels/conv_pool_2d_util.h" #include "mace/ops/ops_test_util.h" 
namespace mace { diff --git a/mace/ops/core_test.cc b/mace/ops/core_test.cc index 6c1b25de..5afd621f 100644 --- a/mace/ops/core_test.cc +++ b/mace/ops/core_test.cc @@ -52,17 +52,26 @@ TEST(CoreTest, INIT_MODE) { NetDef net_def; for (auto &op_def : op_defs) { net_def.add_op()->CopyFrom(op_def); + net_def.add_op_types(op_def.type()); } - std::shared_ptr op_registry(new OperatorRegistry()); - auto net = - CreateNet(op_registry, net_def, &ws, device, NetMode::INIT); - net->Run(); + std::shared_ptr op_def_registry(new OpDefRegistry()); + std::shared_ptr op_registry(new OpRegistry()); + auto net = std::unique_ptr(new SerialNet( + op_def_registry.get(), op_registry.get(), &net_def, &ws, device, + NetMode::INIT)); + MaceStatus status = net->Init(); + MACE_CHECK(status == MaceStatus::MACE_SUCCESS); + status = net->Run(); + MACE_CHECK(status == MaceStatus::MACE_SUCCESS); EXPECT_TRUE(ws.GetTensor("B2IOutput") != nullptr); EXPECT_TRUE(ws.GetTensor("Output") == nullptr); - - net = CreateNet(op_registry, net_def, &ws, device); - net->Run(); + net = std::unique_ptr(new SerialNet( + op_def_registry.get(), op_registry.get(), &net_def, &ws, device)); + status = net->Init(); + MACE_CHECK(status == MaceStatus::MACE_SUCCESS); + status = net->Run(); + MACE_CHECK(status == MaceStatus::MACE_SUCCESS); EXPECT_TRUE(ws.GetTensor("Output") != nullptr); ExpectTensorNear(*ws.GetTensor("Input"), *ws.GetTensor("Output"), diff --git a/mace/ops/crop.cc b/mace/ops/crop.cc deleted file mode 100644 index 7ed2e9c0..00000000 --- a/mace/ops/crop.cc +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/crop.h" - -namespace mace { -namespace ops { - -void Register_Crop(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Crop") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - CropOp); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Crop") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - CropOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Crop") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - CropOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/crop.h b/mace/ops/crop.h deleted file mode 100644 index f5045069..00000000 --- a/mace/ops/crop.h +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_CROP_H_ -#define MACE_OPS_CROP_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/crop.h" - -namespace mace { -namespace ops { - -template -class CropOp : public Operator { - public: - CropOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), - functor_(context, - OperatorBase::GetOptionalArg("axis", 2), - OperatorBase::GetRepeatedArgs("offset")) {} - - MaceStatus Run(StatsFuture *future) override { - MACE_CHECK(this->InputSize() >= 2) - << "There must be two inputs to crop"; - const std::vector input_list = this->Inputs(); - Tensor *output = this->Output(0); - return functor_(input_list, output, future); - } - - private: - kernels::CropFunctor functor_; -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_CROP_H_ diff --git a/mace/ops/crop_benchmark.cc b/mace/ops/crop_benchmark.cc index e3d22a07..75cd494f 100644 --- a/mace/ops/crop_benchmark.cc +++ b/mace/ops/crop_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/crop_test.cc b/mace/ops/crop_test.cc index b4bb7fdd..67a2fdeb 100644 --- a/mace/ops/crop_test.cc +++ b/mace/ops/crop_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/deconv_2d.cc b/mace/ops/deconv_2d.cc deleted file mode 100644 index af0d7232..00000000 --- a/mace/ops/deconv_2d.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/deconv_2d.h" - -namespace mace { -namespace ops { - -void Register_Deconv2D(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - Deconv2dOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - Deconv2dOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Deconv2D") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - Deconv2dOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/deconv_2d.h b/mace/ops/deconv_2d.h deleted file mode 100644 index 03c4581d..00000000 --- a/mace/ops/deconv_2d.h +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_DECONV_2D_H_ -#define MACE_OPS_DECONV_2D_H_ - -#include -#include - -#include "mace/core/operator.h" -#include "mace/kernels/deconv_2d.h" - -namespace mace { -namespace ops { - -template -class Deconv2dOp : public Operator { - public: - Deconv2dOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), - functor_(context, - OperatorBase::GetRepeatedArgs("strides"), - static_cast(OperatorBase::GetOptionalArg( - "padding", static_cast(SAME))), - OperatorBase::GetRepeatedArgs("padding_values"), - static_cast( - OperatorBase::GetOptionalArg("framework_type", 0)), - kernels::StringToActivationType( - OperatorBase::GetOptionalArg("activation", - "NOOP")), - OperatorBase::GetOptionalArg("max_limit", 0.0f)) {} - - MaceStatus Run(StatsFuture *future) override { - MACE_CHECK(this->InputSize() >= 2, "deconv needs >= 2 inputs."); - const Tensor *input = this->Input(0); - const Tensor *filter = this->Input(1); - kernels::FrameworkType model_type = - static_cast( - OperatorBase::GetOptionalArg("framework_type", 0)); - if (model_type == kernels::CAFFE) { - const Tensor *bias = this->InputSize() >= 3 ? this->Input(2) : nullptr; - Tensor *output = this->Output(OUTPUT); - - return functor_(input, filter, bias, nullptr, output, future); - } else { - const Tensor *output_shape = - this->InputSize() >= 3 ? this->Input(2) : nullptr; - const Tensor *bias = this->InputSize() >= 4 ? 
this->Input(3) : nullptr; - Tensor *output = this->Output(OUTPUT); - - return functor_(input, filter, bias, output_shape, output, future); - } - } - - private: - kernels::Deconv2dFunctor functor_; - - protected: - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_DECONV_2D_H_ diff --git a/mace/ops/deconv_2d_benchmark.cc b/mace/ops/deconv_2d_benchmark.cc index cece56ce..197e8f73 100644 --- a/mace/ops/deconv_2d_benchmark.cc +++ b/mace/ops/deconv_2d_benchmark.cc @@ -14,9 +14,9 @@ #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" -#include "mace/ops/deconv_2d.h" +#include "mace/kernels/conv_pool_2d_util.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/deconv_2d_test.cc b/mace/ops/deconv_2d_test.cc index ef4d426f..88476414 100644 --- a/mace/ops/deconv_2d_test.cc +++ b/mace/ops/deconv_2d_test.cc @@ -15,7 +15,8 @@ #include #include -#include "mace/ops/deconv_2d.h" +#include "mace/kernels/deconv_2d.h" +#include "mace/kernels/conv_pool_2d_util.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/depth_to_space.cc b/mace/ops/depth_to_space.cc deleted file mode 100644 index 0da2bb00..00000000 --- a/mace/ops/depth_to_space.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/depth_to_space.h" - -namespace mace { -namespace ops { - -void Register_DepthToSpace(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - DepthToSpaceOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - DepthToSpaceOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthToSpace") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - DepthToSpaceOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/depth_to_space.h b/mace/ops/depth_to_space.h deleted file mode 100644 index c2946b84..00000000 --- a/mace/ops/depth_to_space.h +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_DEPTH_TO_SPACE_H_ -#define MACE_OPS_DEPTH_TO_SPACE_H_ - -#include -#include - -#include "mace/core/operator.h" -#include "mace/kernels/depth_to_space.h" - -namespace mace { -namespace ops { - -template -class DepthToSpaceOp : public Operator { - public: - DepthToSpaceOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), - block_size_(OperatorBase::GetOptionalArg("block_size", 1)), - functor_(context, this->block_size_) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - Tensor *output = this->Output(OUTPUT); - MACE_CHECK(input->dim_size() == 4, "input dim should be 4"); - - return functor_(input, output, future); - } - - protected: - const int block_size_; - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); - - private: - kernels::DepthToSpaceOpFunctor functor_; -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_DEPTH_TO_SPACE_H_ diff --git a/mace/ops/depth_to_space_benchmark.cc b/mace/ops/depth_to_space_benchmark.cc index 43115167..45bc6036 100644 --- a/mace/ops/depth_to_space_benchmark.cc +++ b/mace/ops/depth_to_space_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/depth_to_space_test.cc b/mace/ops/depth_to_space_test.cc index 768f7c1a..fdce99c1 100644 --- a/mace/ops/depth_to_space_test.cc +++ b/mace/ops/depth_to_space_test.cc @@ -15,7 +15,7 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/depthwise_conv2d.cc b/mace/ops/depthwise_conv2d.cc deleted file mode 100644 index 61f87e5f..00000000 --- a/mace/ops/depthwise_conv2d.cc +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/depthwise_conv2d.h" - -namespace mace { -namespace ops { - -void Register_DepthwiseConv2d(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - DepthwiseConv2dOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - DepthwiseConv2dOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - DepthwiseConv2dOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("DepthwiseConv2d") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - DepthwiseConv2dOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/depthwise_conv2d.h b/mace/ops/depthwise_conv2d.h deleted file mode 100644 index 549af07a..00000000 --- a/mace/ops/depthwise_conv2d.h +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_DEPTHWISE_CONV2D_H_ -#define MACE_OPS_DEPTHWISE_CONV2D_H_ - -#include -#include - -#include "mace/core/operator.h" -#include "mace/kernels/conv_2d.h" -#include "mace/kernels/depthwise_conv2d.h" -#include "mace/ops/conv_pool_2d_base.h" - -namespace mace { -namespace ops { - -template -class DepthwiseConv2dOp : public ConvPool2dOpBase { - public: - DepthwiseConv2dOp(const OperatorDef &op_def, OpKernelContext *context) - : ConvPool2dOpBase(op_def, context), - functor_(context, - this->strides_.data(), - this->padding_type_, - this->paddings_, - this->dilations_.data(), - kernels::StringToActivationType( - OperatorBase::GetOptionalArg("activation", - "NOOP")), - OperatorBase::GetOptionalArg("max_limit", 0.0f)) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - const Tensor *filter = this->Input(FILTER); - const Tensor *bias = nullptr; - if (this->InputSize() >= 3) { - bias = this->Input(BIAS); - } - Tensor *output = this->Output(OUTPUT); - return functor_(input, filter, bias, output, future); - } - - private: - kernels::DepthwiseConv2dFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_DEPTHWISE_CONV2D_H_ diff --git a/mace/ops/depthwise_conv2d_benchmark.cc b/mace/ops/depthwise_conv2d_benchmark.cc index 60abfaf3..3257e580 100644 --- a/mace/ops/depthwise_conv2d_benchmark.cc +++ b/mace/ops/depthwise_conv2d_benchmark.cc @@ -14,9 +14,9 @@ #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" -#include "mace/ops/conv_2d.h" +#include "mace/kernels/conv_pool_2d_util.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/depthwise_conv2d_test.cc b/mace/ops/depthwise_conv2d_test.cc index 39dd6944..3089286c 100644 --- a/mace/ops/depthwise_conv2d_test.cc +++ b/mace/ops/depthwise_conv2d_test.cc @@ 
-12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/ops/conv_2d.h" +#include "mace/kernels/conv_pool_2d_util.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/eltwise.cc b/mace/ops/eltwise.cc deleted file mode 100644 index 2e82fb70..00000000 --- a/mace/ops/eltwise.cc +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/eltwise.h" - -namespace mace { -namespace ops { - -void Register_Eltwise(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - EltwiseOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - EltwiseOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - EltwiseOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - EltwiseOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Eltwise") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - EltwiseOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/eltwise.h b/mace/ops/eltwise.h deleted file mode 100644 index f7952562..00000000 --- a/mace/ops/eltwise.h +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_ELTWISE_H_ -#define MACE_OPS_ELTWISE_H_ - -#include "mace/core/operator.h" -#include "mace/kernels/eltwise.h" - -namespace mace { -namespace ops { - -template -class EltwiseOp : public Operator { - public: - EltwiseOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), - functor_( - context, - static_cast(OperatorBase::GetOptionalArg( - "type", static_cast(kernels::EltwiseType::NONE))), - OperatorBase::GetRepeatedArgs("coeff"), - OperatorBase::GetOptionalArg("scalar_input", 1.0), - OperatorBase::GetOptionalArg("scalar_input_index", 1), - static_cast(OperatorBase::GetOptionalArg( - "data_format", 0))) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input0 = this->Input(0); - const Tensor *input1 = this->InputSize() == 2 ? this->Input(1) : nullptr; - Tensor *output = this->Output(OUTPUT); - return functor_(input0, input1, output, future); - } - - private: - kernels::EltwiseFunctor functor_; - - private: - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_ELTWISE_H_ diff --git a/mace/ops/eltwise_benchmark.cc b/mace/ops/eltwise_benchmark.cc index d12c97b3..4a8fa041 100644 --- a/mace/ops/eltwise_benchmark.cc +++ b/mace/ops/eltwise_benchmark.cc @@ -14,7 +14,7 @@ #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/kernels/eltwise.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/eltwise_test.cc b/mace/ops/eltwise_test.cc index d1506987..da9687ce 100644 --- a/mace/ops/eltwise_test.cc +++ b/mace/ops/eltwise_test.cc @@ -13,7 +13,7 @@ // limitations under the License. 
#include "mace/kernels/eltwise.h" -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/expand_dims.cc b/mace/ops/expand_dims.cc deleted file mode 100644 index 5b10d5a3..00000000 --- a/mace/ops/expand_dims.cc +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/expand_dims.h" - -namespace mace { -namespace ops { - -void Register_ExpandDims(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ExpandDims") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ExpandDimsOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ExpandDims") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ExpandDimsOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ExpandDims") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ExpandDimsOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/expand_dims.h b/mace/ops/expand_dims.h deleted file mode 100644 index b7363c3c..00000000 --- a/mace/ops/expand_dims.h +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_EXPAND_DIMS_H_ -#define MACE_OPS_EXPAND_DIMS_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/expand_dims.h" - -namespace mace { -namespace ops { - -template -class ExpandDimsOp : public Operator { - public: - ExpandDimsOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), - functor_(context, OperatorBase::GetOptionalArg("axis", 0)) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - Tensor *output = this->Output(OUTPUT); - - return functor_(input, output, future); - } - - private: - kernels::ExpandDimsFunctor functor_; - - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_EXPAND_DIMS_H_ diff --git a/mace/ops/expand_dims_test.cc b/mace/ops/expand_dims_test.cc index f5650c9c..ac3312ea 100644 --- a/mace/ops/expand_dims_test.cc +++ b/mace/ops/expand_dims_test.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "gmock/gmock.h" -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/fill.cc b/mace/ops/fill.cc deleted file mode 100644 index 93e6dadd..00000000 --- a/mace/ops/fill.cc +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/fill.h" - -namespace mace { -namespace ops { - -void Register_Fill(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Fill") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - FillOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/fill.h b/mace/ops/fill.h deleted file mode 100644 index b6836d11..00000000 --- a/mace/ops/fill.h +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_FILL_H_ -#define MACE_OPS_FILL_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/fill.h" - -namespace mace { -namespace ops { - -template -class FillOp : public Operator { - public: - FillOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *shape = this->Input(SHAPE); - const Tensor *value = this->Input(VALUE); - Tensor *output = this->Output(OUTPUT); - return functor_(shape, value, output, future); - } - - private: - kernels::FillFunctor functor_; - - MACE_OP_INPUT_TAGS(SHAPE, VALUE); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_FILL_H_ diff --git a/mace/ops/fill_test.cc b/mace/ops/fill_test.cc index 1808b0b5..8ecbed5d 100644 --- a/mace/ops/fill_test.cc +++ b/mace/ops/fill_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/folded_batch_norm.cc b/mace/ops/folded_batch_norm.cc deleted file mode 100644 index f7600750..00000000 --- a/mace/ops/folded_batch_norm.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/folded_batch_norm.h" - -namespace mace { -namespace ops { - -void Register_FoldedBatchNorm(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - FoldedBatchNormOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - FoldedBatchNormOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - FoldedBatchNormOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/folded_batch_norm.h b/mace/ops/folded_batch_norm.h deleted file mode 100644 index 345d87b4..00000000 --- a/mace/ops/folded_batch_norm.h +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_FOLDED_BATCH_NORM_H_ -#define MACE_OPS_FOLDED_BATCH_NORM_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/batch_norm.h" - -namespace mace { -namespace ops { - -template -class FoldedBatchNormOp : public Operator { - public: - FoldedBatchNormOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context, - true, - kernels::StringToActivationType( - OperatorBase::GetOptionalArg("activation", - "NOOP")), - OperatorBase::GetOptionalArg("max_limit", 0.0f)) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - const Tensor *scale = this->Input(SCALE); - const Tensor *offset = this->Input(OFFSET); - - MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional. ", - input->dim_size()); - MACE_CHECK(scale->dim_size() == 1, "scale must be 1-dimensional. ", - scale->dim_size()); - MACE_CHECK(offset->dim_size() == 1, "offset must be 1-dimensional. ", - offset->dim_size()); - - Tensor *output = this->Output(OUTPUT); - MACE_RETURN_IF_ERROR(output->ResizeLike(input)); - - return functor_(input, scale, offset, nullptr, nullptr, 0, output, future); - } - - private: - kernels::BatchNormFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_FOLDED_BATCH_NORM_H_ diff --git a/mace/ops/folded_batch_norm_test.cc b/mace/ops/folded_batch_norm_test.cc index 16a6ad68..a19d7d77 100644 --- a/mace/ops/folded_batch_norm_test.cc +++ b/mace/ops/folded_batch_norm_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { @@ -51,7 +51,7 @@ void Simple() { if (D == DeviceType::CPU) { net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); - OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") + OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputNCHW") .Input("Scale") .Input("Offset") @@ -68,7 +68,7 @@ void Simple() { BufferToImage(&net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT); - OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") + OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputImage") .Input("ScaleImage") .Input("OffsetImage") @@ -115,7 +115,7 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); - OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") + OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputNCHW") .Input("Scale") .Input("Offset") @@ -140,7 +140,7 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomOPENCL) { BufferToImage(&net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT); - OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") + OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputImage") .Input("ScaleImage") .Input("OffsetImage") @@ -177,7 +177,7 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); - OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") + OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputNCHW") .Input("Scale") .Input("Offset") @@ -202,7 +202,7 @@ TEST_F(FoldedBatchNormOpTest, SimpleRandomHalfOPENCL) { BufferToImage(&net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT); - OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") + OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputImage") .Input("ScaleImage") .Input("OffsetImage") @@ -240,7 +240,7 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { 
net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); - OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") + OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputNCHW") .Input("Scale") .Input("Offset") @@ -265,7 +265,7 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomOPENCL) { BufferToImage(&net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT); - OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") + OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputImage") .Input("ScaleImage") .Input("OffsetImage") @@ -301,7 +301,7 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { net.TransformDataFormat("Input", NHWC, "InputNCHW", NCHW); - OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") + OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputNCHW") .Input("Scale") .Input("Offset") @@ -326,7 +326,7 @@ TEST_F(FoldedBatchNormOpTest, ComplexRandomHalfOPENCL) { BufferToImage(&net, "Offset", "OffsetImage", kernels::BufferType::ARGUMENT); - OpDefBuilder("FoldedBatchNorm", "FoldedBatchNormTest") + OpDefBuilder("BatchNorm", "FoldedBatchNormTest") .Input("InputImage") .Input("ScaleImage") .Input("OffsetImage") diff --git a/mace/ops/fully_connected.cc b/mace/ops/fully_connected.cc deleted file mode 100644 index 31f3bf86..00000000 --- a/mace/ops/fully_connected.cc +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/fully_connected.h" - -namespace mace { -namespace ops { - -void Register_FullyConnected(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - FullyConnectedOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - FullyConnectedOp); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - FullyConnectedOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - FullyConnectedOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/fully_connected.h b/mace/ops/fully_connected.h deleted file mode 100644 index 313780cb..00000000 --- a/mace/ops/fully_connected.h +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_FULLY_CONNECTED_H_ -#define MACE_OPS_FULLY_CONNECTED_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/fully_connected.h" - -namespace mace { -namespace ops { - -template -class FullyConnectedOp : public Operator { - public: - FullyConnectedOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context, kernels::StringToActivationType( - OperatorBase::GetOptionalArg("activation", - "NOOP")), - OperatorBase::GetOptionalArg("max_limit", 0.0f)) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - const Tensor *weight = this->Input(WEIGHT); // OIHW - const Tensor *bias = this->InputSize() >= 3 ? this->Input(BIAS) : nullptr; - Tensor *output = this->Output(OUTPUT); - - if (D == DeviceType::CPU) { - MACE_CHECK( - input->dim(1) == weight->dim(1) && input->dim(2) == weight->dim(2) && - input->dim(3) == weight->dim(3), - "The shape of Input: ", MakeString(input->shape()), - "The shape of Weight: ", MakeString(weight->shape()), - " don't match."); - } else { - MACE_CHECK( - input->dim(1) == weight->dim(2) && input->dim(2) == weight->dim(3) && - input->dim(3) == weight->dim(1), - "The shape of Input: ", MakeString(input->shape()), - "The shape of Weight: ", MakeString(weight->shape()), - " don't match."); - } - if (bias) { - MACE_CHECK(weight->dim(0) == bias->dim(0), - "The shape of Weight: ", MakeString(weight->shape()), - " and shape of Bias: ", bias->dim(0), - " don't match."); - } - - return functor_(input, weight, - bias, output, future); - } - - private: - kernels::FullyConnectedFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(INPUT, WEIGHT, BIAS); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_FULLY_CONNECTED_H_ diff --git a/mace/ops/fully_connected_benchmark.cc b/mace/ops/fully_connected_benchmark.cc index 9f0fe549..66af8792 100644 --- 
a/mace/ops/fully_connected_benchmark.cc +++ b/mace/ops/fully_connected_benchmark.cc @@ -14,7 +14,7 @@ #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/fully_connected_test.cc b/mace/ops/fully_connected_test.cc index 4bdc53f4..d075aac2 100644 --- a/mace/ops/fully_connected_test.cc +++ b/mace/ops/fully_connected_test.cc @@ -14,8 +14,7 @@ #include -#include "mace/core/operator.h" -#include "mace/kernels/quantize.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/gather.cc b/mace/ops/gather.cc deleted file mode 100644 index 12891c5d..00000000 --- a/mace/ops/gather.cc +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/gather.h" - -namespace mace { -namespace ops { - -void Register_Gather(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Gather") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - GatherOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/gather.h b/mace/ops/gather.h deleted file mode 100644 index fe4026d9..00000000 --- a/mace/ops/gather.h +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_GATHER_H_ -#define MACE_OPS_GATHER_H_ - -#include "mace/core/operator.h" -#include "mace/kernels/gather.h" - -namespace mace { -namespace ops { - -template -class GatherOp : public Operator { - public: - GatherOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context, - OperatorBase::GetOptionalArg("axis", 0), - OperatorBase::GetOptionalArg("y", 1.0)) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *params = this->Input(PARAMS); - const Tensor *indices = this->Input(INDICES); - Tensor *output = this->Output(OUTPUT); - - return functor_(params, indices, output, future); - } - - private: - kernels::GatherFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(PARAMS, INDICES); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_GATHER_H_ diff --git a/mace/ops/gather_benchmark.cc b/mace/ops/gather_benchmark.cc index f55b7462..8a0cd123 100644 --- a/mace/ops/gather_benchmark.cc +++ b/mace/ops/gather_benchmark.cc @@ -14,9 +14,8 @@ #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" -#include "mace/kernels/gather.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/gather_test.cc b/mace/ops/gather_test.cc index 07a8438c..c716b12a 100644 --- a/mace/ops/gather_test.cc +++ 
b/mace/ops/gather_test.cc @@ -14,7 +14,7 @@ #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/identity.cc b/mace/ops/identity.cc deleted file mode 100644 index 61a33356..00000000 --- a/mace/ops/identity.cc +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/identity.h" - -namespace mace { -namespace ops { - -void Register_Identity(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Identity") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - IdentityOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Identity") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - IdentityOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Identity") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - IdentityOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Identity") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - IdentityOp); -#endif -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/identity_test.cc b/mace/ops/identity_test.cc index 988ce760..1ef8848d 100644 --- a/mace/ops/identity_test.cc +++ b/mace/ops/identity_test.cc @@ -13,7 +13,7 @@ // limitations under the License. 
#include "gmock/gmock.h" -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/infer_conv2d_shape.cc b/mace/ops/infer_conv2d_shape.cc deleted file mode 100644 index 26aec062..00000000 --- a/mace/ops/infer_conv2d_shape.cc +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/infer_conv2d_shape.h" - -namespace mace { -namespace ops { - -void Register_InferConv2dShape(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("InferConv2dShape") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - InferConv2dShapeOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("InferConv2dShape") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - InferConv2dShapeOp); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("InferConv2dShape") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - InferConv2dShapeOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("InferConv2dShape") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - InferConv2dShapeOp); -#endif -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/infer_conv2d_shape_test.cc b/mace/ops/infer_conv2d_shape_test.cc index 4f2e0b76..735a599c 100644 --- a/mace/ops/infer_conv2d_shape_test.cc +++ 
b/mace/ops/infer_conv2d_shape_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" #include "mace/kernels/conv_pool_2d_util.h" diff --git a/mace/ops/local_response_norm.cc b/mace/ops/local_response_norm.cc deleted file mode 100644 index f3e19970..00000000 --- a/mace/ops/local_response_norm.cc +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/local_response_norm.h" - -namespace mace { -namespace ops { - -void Register_LocalResponseNorm(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("LocalResponseNorm") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - LocalResponseNormOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/local_response_norm.h b/mace/ops/local_response_norm.h deleted file mode 100644 index 66265f19..00000000 --- a/mace/ops/local_response_norm.h +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_LOCAL_RESPONSE_NORM_H_ -#define MACE_OPS_LOCAL_RESPONSE_NORM_H_ - -#include "mace/core/operator.h" -#include "mace/kernels/local_response_norm.h" - -namespace mace { -namespace ops { - -template -class LocalResponseNormOp : public Operator { - public: - LocalResponseNormOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), functor_(context) { - depth_radius_ = OperatorBase::GetOptionalArg("depth_radius", 5); - bias_ = OperatorBase::GetOptionalArg("bias", 1.0f); - alpha_ = OperatorBase::GetOptionalArg("alpha", 1.0f); - beta_ = OperatorBase::GetOptionalArg("beta", 0.5f); - } - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - - MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional. 
", - input->dim_size()); - - Tensor *output = this->Output(OUTPUT); - MACE_RETURN_IF_ERROR(output->ResizeLike(input)); - - return functor_(input, depth_radius_, bias_, alpha_, beta_, output, future); - } - - private: - int depth_radius_; - float bias_; - float alpha_; - float beta_; - kernels::LocalResponseNormFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_LOCAL_RESPONSE_NORM_H_ diff --git a/mace/ops/local_response_norm_benchmark.cc b/mace/ops/local_response_norm_benchmark.cc index ee15c3e0..893b65d1 100644 --- a/mace/ops/local_response_norm_benchmark.cc +++ b/mace/ops/local_response_norm_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/local_response_norm_test.cc b/mace/ops/local_response_norm_test.cc index 6bb726ea..55adcedd 100644 --- a/mace/ops/local_response_norm_test.cc +++ b/mace/ops/local_response_norm_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/lstmcell.cc b/mace/ops/lstmcell.cc deleted file mode 100644 index 9926ad4b..00000000 --- a/mace/ops/lstmcell.cc +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/lstmcell.h" - -namespace mace { -namespace ops { - -void Register_LSTMCell(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("LSTMCell") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - LSTMCellOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("LSTMCell") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - LSTMCellOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/lstmcell_benchmark.cc b/mace/ops/lstmcell_benchmark.cc index 6ab6baa1..a1972e72 100644 --- a/mace/ops/lstmcell_benchmark.cc +++ b/mace/ops/lstmcell_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/lstmcell_test_util.h" diff --git a/mace/ops/lstmcell_test.cc b/mace/ops/lstmcell_test.cc index 1cfaad01..5b26c677 100644 --- a/mace/ops/lstmcell_test.cc +++ b/mace/ops/lstmcell_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/kernels/eltwise.h" #include "mace/ops/lstmcell_test_util.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/lstmcell_test_util.h b/mace/ops/lstmcell_test_util.h index 06d71151..bbd523c9 100644 --- a/mace/ops/lstmcell_test_util.h +++ b/mace/ops/lstmcell_test_util.h @@ -17,7 +17,7 @@ #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/kernels/eltwise.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/matmul.cc b/mace/ops/matmul.cc deleted file mode 100644 index ca0b68e5..00000000 --- a/mace/ops/matmul.cc +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/matmul.h" - -namespace mace { -namespace ops { - -void Register_MatMul(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - MatMulOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - MatMulOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - MatMulOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("MatMul") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - MatMulOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/matmul.h b/mace/ops/matmul.h deleted file mode 100644 index 64b336a3..00000000 --- a/mace/ops/matmul.h +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_MATMUL_H_ -#define MACE_OPS_MATMUL_H_ - -#include "mace/core/operator.h" -#include "mace/kernels/matmul.h" - -namespace mace { -namespace ops { - -template -class MatMulOp : public Operator { - public: - MatMulOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context), - transpose_a_(OperatorBase::GetOptionalArg("transpose_a", false)), - transpose_b_(OperatorBase::GetOptionalArg("transpose_b", false)) { - } - - MaceStatus Run(StatsFuture *future) override { - const Tensor *A = this->Input(INPUT_A); - const Tensor *B = this->Input(INPUT_B); - Tensor *C = this->Output(OUTPUT); - MACE_CHECK(A->dim_size() == B->dim_size() && A->dim_size() >= 2, - "rank(A) should be equal to rank(B), rank should be greater " - "than or equal to 2"); - index_t rank = A->dim_size(); - for (index_t i = 0; i < rank - 2; ++i) { - MACE_CHECK(A->dim(i) == B->dim(i), - "batch dimensions are not equal: ", - A->dim(i), - " vs. ", - B->dim(i)); - } - index_t ak = transpose_a_ ? A->dim(rank - 2) : A->dim(rank - 1); - index_t bk = transpose_b_ ? 
B->dim(rank - 1) : B->dim(rank - 2); - MACE_CHECK(ak == bk, "the number of A's column ", ak, - " must be equal to B's row ", bk); - - return functor_(A, B, C, - transpose_a_, transpose_b_, future); - } - - private: - MACE_OP_INPUT_TAGS(INPUT_A, INPUT_B); - MACE_OP_OUTPUT_TAGS(OUTPUT); - - kernels::MatMulFunctor functor_; - bool transpose_a_; - bool transpose_b_; -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_MATMUL_H_ diff --git a/mace/ops/matmul_benchmark.cc b/mace/ops/matmul_benchmark.cc index 08b06fa7..c553e33d 100644 --- a/mace/ops/matmul_benchmark.cc +++ b/mace/ops/matmul_benchmark.cc @@ -14,7 +14,7 @@ #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/matmul_test.cc b/mace/ops/matmul_test.cc index 9225b226..83958c75 100644 --- a/mace/ops/matmul_test.cc +++ b/mace/ops/matmul_test.cc @@ -14,7 +14,7 @@ #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/ops_def_register.cc b/mace/ops/ops_def_register.cc new file mode 100644 index 00000000..46ee5184 --- /dev/null +++ b/mace/ops/ops_def_register.cc @@ -0,0 +1,373 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mace/ops/ops_def_register.h" + +#include + +namespace mace { +namespace ops { + +void RegisterOpDefs(OpDefRegistryBase *op_def_registry) { + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Activation") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("AddN") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("ArgMax") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("BatchNorm") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("BatchToSpaceND") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("BiasAdd") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("BufferInverseTransform") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("BufferTransform") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Cast") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("ChannelShuffle") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Concat") + .SetDevicePlaceFunc([]() -> std::vector { 
+ return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Conv2D") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Crop") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Deconv2D") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("DepthToSpace") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("DepthwiseConv2d") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Dequantize") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Eltwise") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("ExpandDims") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Fill") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("FullyConnected") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Gather") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + 
OpRegistrationBuilder("Identity") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("InferConv2dShape") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("LocalResponseNorm") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("LSTMCell") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("MatMul") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Pad") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Pooling") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Quantize") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("ReduceMean") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Reshape") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("ResizeBicubic") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("ResizeBilinear") + .SetDevicePlaceFunc([]() -> std::vector { + 
return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Reverse") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("ScalarMath") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Shape") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Softmax") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("SpaceToBatchND") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("SpaceToDepth") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Split") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("SqrDiffMean") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Squeeze") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Stack") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("StridedSlice") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + 
MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Transpose") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("Unstack") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::CPU, DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("WinogradInverseTransform") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::GPU}; + })); + + MACE_REGISTER_OP_DEF( + op_def_registry, + OpRegistrationBuilder("WinogradTransform") + .SetDevicePlaceFunc([]() -> std::vector { + return {DeviceType::GPU}; + })); +} +} // namespace ops + + +OpDefRegistry::OpDefRegistry() : OpDefRegistryBase() { + ops::RegisterOpDefs(this); +} + +} // namespace mace diff --git a/mace/kernels/kernel.h b/mace/ops/ops_def_register.h similarity index 68% rename from mace/kernels/kernel.h rename to mace/ops/ops_def_register.h index 853e974f..5b2d6acb 100644 --- a/mace/kernels/kernel.h +++ b/mace/ops/ops_def_register.h @@ -12,20 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef MACE_KERNELS_KERNEL_H_ -#define MACE_KERNELS_KERNEL_H_ +#ifndef MACE_OPS_OPS_DEF_REGISTER_H_ +#define MACE_OPS_OPS_DEF_REGISTER_H_ -#include "mace/core/op_kernel_context.h" +#include "mace/core/op_def_registry.h" namespace mace { -namespace kernels { -struct OpKernel { - explicit OpKernel(OpKernelContext *context): context_(context) {} - - OpKernelContext *context_; +class OpDefRegistry : public OpDefRegistryBase { + public: + OpDefRegistry(); + ~OpDefRegistry() override = default; }; -} // namespace kernels } // namespace mace -#endif // MACE_KERNELS_KERNEL_H_ + +#endif // MACE_OPS_OPS_DEF_REGISTER_H_ diff --git a/mace/ops/ops_register.cc b/mace/ops/ops_register.cc deleted file mode 100644 index 1c29386c..00000000 --- a/mace/ops/ops_register.cc +++ /dev/null @@ -1,136 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/ops_register.h" - -namespace mace { - -namespace ops { -// Keep in lexicographical order -extern void Register_Activation(OperatorRegistryBase *op_registry); -extern void Register_AddN(OperatorRegistryBase *op_registry); -extern void Register_ArgMax(OperatorRegistryBase *op_registry); -extern void Register_BatchNorm(OperatorRegistryBase *op_registry); -extern void Register_BatchToSpaceND(OperatorRegistryBase *op_registry); -extern void Register_BiasAdd(OperatorRegistryBase *op_registry); -extern void Register_Cast(OperatorRegistryBase *op_registry); -extern void Register_ChannelShuffle(OperatorRegistryBase *op_registry); -extern void Register_Concat(OperatorRegistryBase *op_registry); -extern void Register_Conv2D(OperatorRegistryBase *op_registry); -extern void Register_Crop(OperatorRegistryBase *op_registry); -extern void Register_Deconv2D(OperatorRegistryBase *op_registry); -extern void Register_DepthToSpace(OperatorRegistryBase *op_registry); -extern void Register_DepthwiseConv2d(OperatorRegistryBase *op_registry); -extern void Register_Dequantize(OperatorRegistryBase *op_registry); -extern void Register_Eltwise(OperatorRegistryBase *op_registry); -extern void Register_ExpandDims(OperatorRegistryBase *op_registry); -extern void Register_Fill(OperatorRegistryBase *op_registry); -extern void Register_FoldedBatchNorm(OperatorRegistryBase *op_registry); -extern void Register_FullyConnected(OperatorRegistryBase *op_registry); -extern void Register_Gather(OperatorRegistryBase *op_registry); -extern void Register_Identity(OperatorRegistryBase *op_registry); -extern void Register_InferConv2dShape(OperatorRegistryBase *op_registry); -extern void Register_LocalResponseNorm(OperatorRegistryBase *op_registry); -extern void Register_MatMul(OperatorRegistryBase *op_registry); -extern void Register_Pad(OperatorRegistryBase *op_registry); -extern void Register_Pooling(OperatorRegistryBase *op_registry); -extern void Register_Proposal(OperatorRegistryBase 
*op_registry); -extern void Register_Quantize(OperatorRegistryBase *op_registry); -extern void Register_ReduceMean(OperatorRegistryBase *op_registry); -extern void Register_Reshape(OperatorRegistryBase *op_registry); -extern void Register_ResizeBicubic(OperatorRegistryBase *op_registry); -extern void Register_ResizeBilinear(OperatorRegistryBase *op_registry); -extern void Register_Reverse(OperatorRegistryBase *op_registry); -extern void Register_ScalarMath(OperatorRegistryBase *op_registry); -extern void Register_Shape(OperatorRegistryBase *op_registry); -extern void Register_Softmax(OperatorRegistryBase *op_registry); -extern void Register_SpaceToBatchND(OperatorRegistryBase *op_registry); -extern void Register_SpaceToDepth(OperatorRegistryBase *op_registry); -extern void Register_Split(OperatorRegistryBase *op_registry); -extern void Register_SqrDiffMean(OperatorRegistryBase *op_registry); -extern void Register_Squeeze(OperatorRegistryBase *op_registry); -extern void Register_Stack(OperatorRegistryBase *op_registry); -extern void Register_StridedSlice(OperatorRegistryBase *op_registry); -extern void Register_Transpose(OperatorRegistryBase *op_registry); -extern void Register_Unstack(OperatorRegistryBase *op_registry); -extern void Register_WinogradInverseTransform(OperatorRegistryBase *op_registry); // NOLINT(whitespace/line_length) -extern void Register_WinogradTransform(OperatorRegistryBase *op_registry); - -#ifdef MACE_ENABLE_OPENCL -extern void Register_BufferTransform(OperatorRegistryBase *op_registry); -extern void Register_BufferInverseTransform(OperatorRegistryBase *op_registry); -extern void Register_LSTMCell(OperatorRegistryBase *op_registry); -#endif // MACE_ENABLE_OPENCL -} // namespace ops - - -OperatorRegistry::OperatorRegistry() : OperatorRegistryBase() { - // Keep in lexicographical order - ops::Register_Activation(this); - ops::Register_AddN(this); - ops::Register_ArgMax(this); - ops::Register_BatchNorm(this); - 
ops::Register_BatchToSpaceND(this); - ops::Register_BiasAdd(this); - ops::Register_Cast(this); - ops::Register_ChannelShuffle(this); - ops::Register_Concat(this); - ops::Register_Conv2D(this); - ops::Register_Crop(this); - ops::Register_Deconv2D(this); - ops::Register_DepthToSpace(this); - ops::Register_DepthwiseConv2d(this); - ops::Register_Dequantize(this); - ops::Register_Eltwise(this); - ops::Register_ExpandDims(this); - ops::Register_Fill(this); - ops::Register_FoldedBatchNorm(this); - ops::Register_FullyConnected(this); - ops::Register_Gather(this); - ops::Register_Identity(this); - ops::Register_InferConv2dShape(this); - ops::Register_LocalResponseNorm(this); - ops::Register_MatMul(this); - ops::Register_Pad(this); - ops::Register_Pooling(this); - ops::Register_Proposal(this); - ops::Register_Quantize(this); - ops::Register_ReduceMean(this); - ops::Register_Reshape(this); - ops::Register_ResizeBicubic(this); - ops::Register_ResizeBilinear(this); - ops::Register_Reverse(this); - ops::Register_ScalarMath(this); - ops::Register_Shape(this); - ops::Register_Softmax(this); - ops::Register_SpaceToBatchND(this); - ops::Register_SpaceToDepth(this); - ops::Register_Split(this); - ops::Register_Stack(this); - ops::Register_StridedSlice(this); - ops::Register_SqrDiffMean(this); - ops::Register_Squeeze(this); - ops::Register_Transpose(this); - ops::Register_Unstack(this); - ops::Register_WinogradInverseTransform(this); - ops::Register_WinogradTransform(this); - -#ifdef MACE_ENABLE_OPENCL - ops::Register_BufferTransform(this); - ops::Register_BufferInverseTransform(this); - ops::Register_LSTMCell(this); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace mace diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h index a3b8c4d9..4823bd80 100644 --- a/mace/ops/ops_test_util.h +++ b/mace/ops/ops_test_util.h @@ -32,7 +32,9 @@ #include "mace/core/tensor.h" #include "mace/core/workspace.h" #include "mace/kernels/opencl/common.h" -#include "mace/ops/ops_register.h" 
+#include "mace/kernels/ops_register.h" +#include "mace/ops/ops_def_register.h" +#include "mace/public/mace.h" #include "mace/utils/utils.h" #include "mace/utils/quantize.h" @@ -139,8 +141,8 @@ class OpTestContext { class OpsTestNet { public: OpsTestNet() : - op_registry_(new OperatorRegistry()) { - } + op_def_registry_(new OpDefRegistry()), + op_registry_(new OpRegistry()) {} template void AddInputFromArray(const std::string &name, @@ -453,16 +455,24 @@ class OpsTestNet { NetDef net_def; for (auto &op_def_ : op_defs_) { net_def.add_op()->CopyFrom(op_def_); + net_def.add_op_types(op_def_.type()); } - net_ = CreateNet(op_registry_, net_def, &ws_, - OpTestContext::Get()->GetDevice(device)); + net_ = std::unique_ptr(new SerialNet( + op_def_registry_.get(), + op_registry_.get(), + &net_def, + &ws_, + OpTestContext::Get()->GetDevice(device))); + MaceStatus status = net_->Init(); device_type_ = device; - return net_ != nullptr; + return status == MaceStatus::MACE_SUCCESS; } MaceStatus Run() { MACE_CHECK_NOTNULL(net_); - return net_->Run(); + MACE_RETURN_IF_ERROR(net_->Run()); + Sync(); + return MaceStatus::MACE_SUCCESS; } // DEPRECATED(liyin): @@ -477,7 +487,7 @@ class OpsTestNet { Setup(device); MACE_RETURN_IF_ERROR(Run()); } - return MACE_SUCCESS; + return MaceStatus::MACE_SUCCESS; } else { Setup(device); return Run(); @@ -491,14 +501,22 @@ class OpsTestNet { MaceStatus RunNet(const NetDef &net_def, const DeviceType device) { device_type_ = device; - net_ = CreateNet(op_registry_, - net_def, - &ws_, - OpTestContext::Get()->GetDevice(device), - NetMode::INIT); - MACE_RETURN_IF_ERROR(net_->Run()); - net_ = CreateNet(op_registry_, net_def, &ws_, - OpTestContext::Get()->GetDevice(device)); + auto net = std::unique_ptr(new SerialNet( + op_def_registry_.get(), + op_registry_.get(), + &net_def, + &ws_, + OpTestContext::Get()->GetDevice(device), + NetMode::INIT)); + MACE_RETURN_IF_ERROR(net->Init()); + MACE_RETURN_IF_ERROR(net->Run()); + net_ = std::unique_ptr(new SerialNet( + 
op_def_registry_.get(), + op_registry_.get(), + &net_def, + &ws_, + OpTestContext::Get()->GetDevice(device))); + MACE_RETURN_IF_ERROR(net_->Init()); return net_->Run(); } @@ -520,7 +538,8 @@ class OpsTestNet { } public: - std::shared_ptr op_registry_; + std::shared_ptr op_def_registry_; + std::shared_ptr op_registry_; Workspace ws_; std::vector op_defs_; std::unique_ptr net_; diff --git a/mace/ops/pad.cc b/mace/ops/pad.cc deleted file mode 100644 index e6d468b2..00000000 --- a/mace/ops/pad.cc +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/pad.h" - -namespace mace { -namespace ops { - -void Register_Pad(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - PadOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - PadOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pad") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - PadOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/pad.h b/mace/ops/pad.h deleted file mode 100644 index 6a7ce102..00000000 --- a/mace/ops/pad.h +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_PAD_H_ -#define MACE_OPS_PAD_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/pad.h" - -namespace mace { -namespace ops { - -template -class PadOp : public Operator { - public: - PadOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context, - OperatorBase::GetRepeatedArgs("paddings"), - OperatorBase::GetOptionalArg("constant_value", 0.0)) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input_tensor = this->Input(0); - Tensor *output_tensor = this->Output(0); - return functor_(input_tensor, output_tensor, future); - } - - private: - kernels::PadFunctor functor_; -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_PAD_H_ diff --git a/mace/ops/pad_benchmark.cc b/mace/ops/pad_benchmark.cc index c5172f8c..ad8a1254 100644 --- a/mace/ops/pad_benchmark.cc +++ b/mace/ops/pad_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/pad_test.cc b/mace/ops/pad_test.cc index 3a68248e..a8c2267f 100644 --- a/mace/ops/pad_test.cc +++ b/mace/ops/pad_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/pooling.cc b/mace/ops/pooling.cc deleted file mode 100644 index b16fd261..00000000 --- a/mace/ops/pooling.cc +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/pooling.h" - -namespace mace { -namespace ops { - -void Register_Pooling(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - PoolingOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - PoolingOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - PoolingOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Pooling") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - PoolingOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/pooling.h b/mace/ops/pooling.h deleted file mode 100644 index 3d1753b3..00000000 --- a/mace/ops/pooling.h +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_POOLING_H_ -#define MACE_OPS_POOLING_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/pooling.h" -#include "mace/ops/conv_pool_2d_base.h" - -namespace mace { -namespace ops { - -template -class PoolingOp : public ConvPool2dOpBase { - public: - PoolingOp(const OperatorDef &op_def, OpKernelContext *context) - : ConvPool2dOpBase(op_def, context), - kernels_(OperatorBase::GetRepeatedArgs("kernels")), - pooling_type_( - static_cast(OperatorBase::GetOptionalArg( - "pooling_type", static_cast(AVG)))), - functor_(context, - pooling_type_, - kernels_.data(), - this->strides_.data(), - this->padding_type_, - this->paddings_, - this->dilations_.data()) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - Tensor *output = this->Output(OUTPUT); - - return functor_(input, output, future); - }; - - protected: - std::vector kernels_; - PoolingType pooling_type_; - kernels::PoolingFunctor functor_; - - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_POOLING_H_ diff --git a/mace/ops/pooling_benchmark.cc b/mace/ops/pooling_benchmark.cc index 36b9d607..e5199001 100644 --- a/mace/ops/pooling_benchmark.cc +++ b/mace/ops/pooling_benchmark.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/kernels/pooling.h" -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/kernels/conv_pool_2d_util.h" +#include "mace/kernels/pooling.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/pooling_test.cc b/mace/ops/pooling_test.cc index c22e9b13..4cd432d5 100644 --- a/mace/ops/pooling_test.cc +++ b/mace/ops/pooling_test.cc @@ -14,11 +14,10 @@ #include "gtest/gtest.h" -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/kernels/pooling.h" -#include "mace/ops/conv_pool_2d_base.h" +#include "mace/kernels/conv_pool_2d_util.h" #include "mace/ops/ops_test_util.h" -#include "mace/kernels/quantize.h" namespace mace { namespace ops { diff --git a/mace/ops/proposal.cc b/mace/ops/proposal.cc deleted file mode 100644 index 2b75eeaf..00000000 --- a/mace/ops/proposal.cc +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/proposal.h" - -namespace mace { -namespace ops { - -void Register_Proposal(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Proposal") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ProposalOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/proposal.h b/mace/ops/proposal.h deleted file mode 100644 index d879e240..00000000 --- a/mace/ops/proposal.h +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_PROPOSAL_H_ -#define MACE_OPS_PROPOSAL_H_ - -#include "mace/core/operator.h" -#include "mace/kernels/proposal.h" - -namespace mace { -namespace ops { - -template -class ProposalOp : public Operator { - public: - ProposalOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context, - OperatorBase::GetOptionalArg("min_size", 16), - OperatorBase::GetOptionalArg("nms_thresh", 0.7), - OperatorBase::GetOptionalArg("pre_nms_top_n", 6000), - OperatorBase::GetOptionalArg("post_nms_top_n", 300), - OperatorBase::GetOptionalArg("feat_stride", 0), - OperatorBase::GetOptionalArg("base_size", 12), - OperatorBase::GetRepeatedArgs("scales"), - OperatorBase::GetRepeatedArgs("ratios")) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *rpn_cls_prob = this->Input(RPN_CLS_PROB); - const Tensor *rpn_bbox_pred = this->Input(RPN_BBOX_PRED); - const Tensor *img_info = this->Input(IMG_INFO); - - Tensor *output = this->Output(ROIS); - - return functor_(rpn_cls_prob, rpn_bbox_pred, img_info, output, future); - } - - private: - kernels::ProposalFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(RPN_CLS_PROB, RPN_BBOX_PRED, IMG_INFO); - MACE_OP_OUTPUT_TAGS(ROIS); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_PROPOSAL_H_ diff --git a/mace/ops/proposal_test.cc b/mace/ops/proposal_test.cc deleted file mode 100644 index e8b2ae5a..00000000 --- a/mace/ops/proposal_test.cc +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/core/operator.h" -#include "mace/ops/ops_test_util.h" - -namespace mace { -namespace ops { -namespace test { - -class ProposalOpTest : public OpsTestBase {}; - -TEST_F(ProposalOpTest, CPUSimple) { - const int img_height = 256; - const int img_width = 256; - const int height = 3; - const int width = 4; - - OpsTestNet net; - - OpDefBuilder("Proposal", "ProposalTest") - .Input("RpnCLSProb") - .Input("RpnBBoxPred") - .Input("ImgInfo") - .AddIntArg("min_size", 16) - .AddFloatArg("nms_thresh", 0.7) - .AddIntArg("pre_nms_top_n", 12000) - .AddIntArg("post_nms_top_n", 2000) - .AddIntArg("feat_stride", 16) - .AddIntArg("base_size", 16) - .AddIntsArg("scales", {8, 16, 32}) - .AddFloatsArg("ratios", {0.5, 1, 2}) - .Output("Output") - .Finalize(net.NewOperatorDef()); - - std::vector scores(height * width * 18); - for (size_t i = 0; i < scores.size(); ++i) { - scores[i] = i; - } - - // Add input data - net.AddInputFromArray("RpnCLSProb", - {1, height, width, 18}, scores); - net.AddRepeatedInput("RpnBBoxPred", - {1, height, width, 4 * 9}, 1); - net.AddInputFromArray("ImgInfo", {1, 1, 1, 3}, - {img_height, img_width, 2}); - - // Run - net.RunOp(); - - auto expected_tensor = net.CreateTensor({1, 1, 1, 5}, - {0, 0, 0, 255, 255}); - - ExpectTensorNear(*expected_tensor, *net.GetTensor("Output"), 1e-5); -} - -} // namespace test -} // namespace ops -} // namespace mace diff --git a/mace/ops/quantize.cc b/mace/ops/quantize.cc deleted file mode 100644 index 35f61ac9..00000000 --- a/mace/ops/quantize.cc +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2018 
Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/quantize.h" - -namespace mace { -namespace ops { - -void Register_Quantize(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Quantize") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - QuantizeOp); -} - -void Register_Dequantize(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Dequantize") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - DequantizeOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/quantize.h b/mace/ops/quantize.h deleted file mode 100644 index 2e7a77c2..00000000 --- a/mace/ops/quantize.h +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_QUANTIZE_H_ -#define MACE_OPS_QUANTIZE_H_ - -#include "mace/core/operator.h" -#include "mace/kernels/quantize.h" - -namespace mace { -namespace ops { - -template -class QuantizeOp : public Operator { - public: - QuantizeOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context), - non_zero_( - static_cast(OperatorBase::GetOptionalArg("non_zero", - 0))) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - Tensor *output = this->Output(OUTPUT); - MACE_RETURN_IF_ERROR(output->ResizeLike(input)); - - return functor_(input, non_zero_, output, future); - } - - private: - kernels::QuantizeFunctor functor_; - bool non_zero_; - - protected: - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -template -class DequantizeOp : public Operator { - public: - DequantizeOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), functor_(context) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - Tensor *output = this->Output(OUTPUT); - MACE_RETURN_IF_ERROR(output->ResizeLike(input)); - - return functor_(input, output, future); - } - - private: - kernels::DequantizeFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_QUANTIZE_H_ diff --git a/mace/ops/quantize_test.cc b/mace/ops/quantize_test.cc index 5f9fd0d8..207ab4e4 100644 --- a/mace/ops/quantize_test.cc +++ b/mace/ops/quantize_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/reduce_mean.cc b/mace/ops/reduce_mean.cc deleted file mode 100644 index ee4d1716..00000000 --- a/mace/ops/reduce_mean.cc +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/reduce_mean.h" - -namespace mace { -namespace ops { - -void Register_ReduceMean(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ReduceMean") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ReduceMeanOp); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ReduceMean") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ReduceMeanOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ReduceMean") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ReduceMeanOp); -#endif -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/reduce_mean.h b/mace/ops/reduce_mean.h deleted file mode 100644 index 0ef9c102..00000000 --- a/mace/ops/reduce_mean.h +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_REDUCE_MEAN_H_ -#define MACE_OPS_REDUCE_MEAN_H_ - -#include -#include - -#include "mace/core/operator.h" -#include "mace/kernels/reduce_mean.h" - -namespace mace { -namespace ops { - -template -class ReduceMeanOp : public Operator { - public: - ReduceMeanOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context, - OperatorBase::GetRepeatedArgs("axis"), - OperatorBase::GetOptionalArg("keepdims", false)) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - const std::vector axis = - OperatorBase::GetRepeatedArgs("axis"); - const int left = static_cast(input->dim_size() * -1); - const int right = static_cast(input->dim_size()); - if (axis.size()) { - for (unsigned int i = 0; i < axis.size(); ++i) { - MACE_CHECK(axis[i] > left && axis[i] < right, "Axis is over range."); - } - } - Tensor *output = this->Output(OUTPUT); - - return functor_(input, output, future); - } - - private: - kernels::ReduceMeanFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_REDUCE_MEAN_H_ diff --git a/mace/ops/reduce_mean_benchmark.cc b/mace/ops/reduce_mean_benchmark.cc index 3591c9b1..02f6d447 100644 --- a/mace/ops/reduce_mean_benchmark.cc +++ b/mace/ops/reduce_mean_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/reduce_mean_test.cc b/mace/ops/reduce_mean_test.cc index b1bbe5cc..24ff7a4a 100644 --- a/mace/ops/reduce_mean_test.cc +++ b/mace/ops/reduce_mean_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/reshape.cc b/mace/ops/reshape.cc deleted file mode 100644 index 2831aeba..00000000 --- a/mace/ops/reshape.cc +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/reshape.h" - -namespace mace { -namespace ops { - -void Register_Reshape(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Reshape") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ReshapeOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Reshape") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ReshapeOp); - - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Reshape") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ReshapeOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Reshape") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ReshapeOp); -#endif -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/reshape_test.cc b/mace/ops/reshape_test.cc index 947e968b..bdc7ab97 100644 --- a/mace/ops/reshape_test.cc +++ b/mace/ops/reshape_test.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "gmock/gmock.h" -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/resize_bicubic.cc b/mace/ops/resize_bicubic.cc deleted file mode 100644 index 7a50522f..00000000 --- a/mace/ops/resize_bicubic.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/resize_bicubic.h" - -namespace mace { -namespace ops { - -void Register_ResizeBicubic(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBicubic") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ResizeBicubicOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBicubic") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ResizeBicubicOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBicubic") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ResizeBicubicOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/resize_bicubic.h b/mace/ops/resize_bicubic.h deleted file mode 100644 index df9fc11c..00000000 --- a/mace/ops/resize_bicubic.h +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_RESIZE_BICUBIC_H_ -#define MACE_OPS_RESIZE_BICUBIC_H_ - -#include "mace/core/operator.h" -#include "mace/kernels/resize_bicubic.h" - -namespace mace { -namespace ops { - -template -class ResizeBicubicOp : public Operator { - public: - ResizeBicubicOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context, - OperatorBase::GetOptionalArg("align_corners", false), - OperatorBase::GetRepeatedArgs("size", {-1, -1})) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(0); - Tensor *output = this->Output(0); - - MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional.", - input->dim_size()); - - return functor_(input, output, future); - } - - private: - kernels::ResizeBicubicFunctor functor_; -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_RESIZE_BICUBIC_H_ - diff --git a/mace/ops/resize_bicubic_benchmark.cc b/mace/ops/resize_bicubic_benchmark.cc index ba22f4fe..f0847e4c 100644 --- a/mace/ops/resize_bicubic_benchmark.cc +++ b/mace/ops/resize_bicubic_benchmark.cc @@ -13,7 +13,7 @@ // limitations under the License. #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/resize_bicubic_test.cc b/mace/ops/resize_bicubic_test.cc index 97da0480..8dc1dbf7 100644 --- a/mace/ops/resize_bicubic_test.cc +++ b/mace/ops/resize_bicubic_test.cc @@ -14,9 +14,8 @@ #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" -#include "mace/ops/resize_bicubic.h" namespace mace { namespace ops { diff --git a/mace/ops/resize_bilinear.cc b/mace/ops/resize_bilinear.cc deleted file mode 100644 index 31062569..00000000 --- a/mace/ops/resize_bilinear.cc +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/resize_bilinear.h" - -namespace mace { -namespace ops { - -void Register_ResizeBilinear(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ResizeBilinearOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ResizeBilinearOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ResizeBilinearOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ResizeBilinear") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ResizeBilinearOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/resize_bilinear.h b/mace/ops/resize_bilinear.h deleted file mode 100644 index f328a9a4..00000000 --- a/mace/ops/resize_bilinear.h +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_RESIZE_BILINEAR_H_ -#define MACE_OPS_RESIZE_BILINEAR_H_ - -#include "mace/core/operator.h" -#include "mace/kernels/resize_bilinear.h" - -namespace mace { -namespace ops { - -template -class ResizeBilinearOp : public Operator { - public: - ResizeBilinearOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context, - OperatorBase::GetRepeatedArgs("size", {-1, -1}), - OperatorBase::GetOptionalArg("align_corners", false)) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(0); - Tensor *output = this->Output(0); - - MACE_CHECK(input->dim_size() == 4, "input must be 4-dimensional.", - input->dim_size()); - - return functor_(input, output, future); - } - - private: - kernels::ResizeBilinearFunctor functor_; -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_RESIZE_BILINEAR_H_ diff --git a/mace/ops/resize_bilinear_benchmark.cc b/mace/ops/resize_bilinear_benchmark.cc index 993d7269..2fd6b6c2 100644 --- a/mace/ops/resize_bilinear_benchmark.cc +++ b/mace/ops/resize_bilinear_benchmark.cc @@ -13,7 +13,7 @@ // limitations under the License. 
#include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/resize_bilinear_test.cc b/mace/ops/resize_bilinear_test.cc index c628bd9b..3ff5372a 100644 --- a/mace/ops/resize_bilinear_test.cc +++ b/mace/ops/resize_bilinear_test.cc @@ -14,9 +14,8 @@ #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" -#include "mace/ops/resize_bilinear.h" namespace mace { namespace ops { diff --git a/mace/ops/reverse.cc b/mace/ops/reverse.cc deleted file mode 100644 index 4660fba7..00000000 --- a/mace/ops/reverse.cc +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/reverse.h" - -namespace mace { -namespace ops { - -void Register_Reverse(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Reverse") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ReverseOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/reverse.h b/mace/ops/reverse.h deleted file mode 100644 index a753a4e2..00000000 --- a/mace/ops/reverse.h +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_REVERSE_H_ -#define MACE_OPS_REVERSE_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/reverse.h" - -namespace mace { -namespace ops { - -template -class ReverseOp : public Operator { - public: - ReverseOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), functor_(context) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - const Tensor *axis = this->Input(AXIS); - Tensor *output = this->Output(OUTPUT); - return functor_(input, axis, output, future); - } - - private: - kernels::ReverseFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(INPUT, AXIS); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_REVERSE_H_ diff --git a/mace/ops/reverse_benchmark.cc b/mace/ops/reverse_benchmark.cc index c6352fab..40f2f908 100644 --- a/mace/ops/reverse_benchmark.cc +++ b/mace/ops/reverse_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/reverse_test.cc b/mace/ops/reverse_test.cc index afa17e50..282214fd 100644 --- a/mace/ops/reverse_test.cc +++ b/mace/ops/reverse_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/scalar_math.cc b/mace/ops/scalar_math.cc deleted file mode 100644 index 82ef3eb3..00000000 --- a/mace/ops/scalar_math.cc +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/scalar_math.h" - -namespace mace { -namespace ops { - -void Register_ScalarMath(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ScalarMath") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ScalarMathOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ScalarMath") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ScalarMathOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ScalarMath") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ScalarMathOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ScalarMath") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ScalarMathOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/scalar_math.h b/mace/ops/scalar_math.h deleted file mode 100644 index 356c9371..00000000 --- a/mace/ops/scalar_math.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_SCALAR_MATH_H_ -#define MACE_OPS_SCALAR_MATH_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/scalar_math.h" - -namespace mace { -namespace ops { - -template -class ScalarMathOp : public Operator { - public: - ScalarMathOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), - functor_(context, - static_cast( - OperatorBase::GetOptionalArg( - "type", static_cast(kernels::EltwiseType::NONE))), - OperatorBase::GetRepeatedArgs("coeff"), - OperatorBase::GetOptionalArg("scalar_input", 1.0), - OperatorBase::GetOptionalArg( - "scalar_input_index", 1)) {} - - MaceStatus Run(StatsFuture *future) override { - const std::vector input_list = this->Inputs(); - Tensor *output = this->Output(0); - return functor_(input_list, output, future); - } - - private: - kernels::ScalarMathFunctor functor_; -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_SCALAR_MATH_H_ diff --git a/mace/ops/scalar_math_test.cc b/mace/ops/scalar_math_test.cc index 0d34b80a..99caa07d 100644 --- a/mace/ops/scalar_math_test.cc +++ b/mace/ops/scalar_math_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" #include "mace/kernels/eltwise.h" diff --git a/mace/ops/shape.cc b/mace/ops/shape.cc deleted file mode 100644 index 6815496f..00000000 --- a/mace/ops/shape.cc +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/shape.h" - -namespace mace { -namespace ops { - -void Register_Shape(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Shape") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - ShapeOp); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Shape") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ShapeOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Shape") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - ShapeOp); -#endif -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/shape_test.cc b/mace/ops/shape_test.cc index 08ccb88b..2b66c7eb 100644 --- a/mace/ops/shape_test.cc +++ b/mace/ops/shape_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/softmax.cc b/mace/ops/softmax.cc deleted file mode 100644 index 64586329..00000000 --- a/mace/ops/softmax.cc +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/softmax.h" - -namespace mace { -namespace ops { - -void Register_Softmax(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - SoftmaxOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - SoftmaxOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - SoftmaxOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - SoftmaxOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/softmax.h b/mace/ops/softmax.h deleted file mode 100644 index 047402f0..00000000 --- a/mace/ops/softmax.h +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_SOFTMAX_H_ -#define MACE_OPS_SOFTMAX_H_ - -#include "mace/core/operator.h" -#include "mace/kernels/softmax.h" - -namespace mace { -namespace ops { - -template -class SoftmaxOp : public Operator { - public: - SoftmaxOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *logits = this->Input(LOGITS); - - Tensor *output = this->Output(OUTPUT); - MACE_RETURN_IF_ERROR(output->ResizeLike(logits)); - - return functor_(logits, output, future); - } - - private: - kernels::SoftmaxFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(LOGITS); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_SOFTMAX_H_ diff --git a/mace/ops/softmax_benchmark.cc b/mace/ops/softmax_benchmark.cc index 009d1aae..482709ad 100644 --- a/mace/ops/softmax_benchmark.cc +++ b/mace/ops/softmax_benchmark.cc @@ -14,7 +14,7 @@ #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/softmax_test.cc b/mace/ops/softmax_test.cc index 012424c5..98b0ad97 100644 --- a/mace/ops/softmax_test.cc +++ b/mace/ops/softmax_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/space_to_batch.cc b/mace/ops/space_to_batch.cc deleted file mode 100644 index 29dbed97..00000000 --- a/mace/ops/space_to_batch.cc +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/space_to_batch.h" - -namespace mace { -namespace ops { - -void Register_SpaceToBatchND(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - SpaceToBatchNDOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - SpaceToBatchNDOp); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - SpaceToBatchNDOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToBatchND") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - SpaceToBatchNDOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/space_to_batch.h b/mace/ops/space_to_batch.h deleted file mode 100644 index fabd7bb2..00000000 --- a/mace/ops/space_to_batch.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_SPACE_TO_BATCH_H_ -#define MACE_OPS_SPACE_TO_BATCH_H_ - -#include -#include - -#include "mace/core/operator.h" -#include "mace/kernels/space_to_batch.h" - -namespace mace { -namespace ops { - -template -class SpaceToBatchNDOp : public Operator { - public: - SpaceToBatchNDOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), - functor_(context, - OperatorBase::GetRepeatedArgs("paddings", {0, 0, 0, 0}), - OperatorBase::GetRepeatedArgs("block_shape", {1, 1})) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *space_tensor = this->Input(INPUT); - Tensor *batch_tensor = this->Output(OUTPUT); - return functor_(space_tensor, batch_tensor, future); - } - - private: - kernels::SpaceToBatchFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_SPACE_TO_BATCH_H_ diff --git a/mace/ops/space_to_batch_benchmark.cc b/mace/ops/space_to_batch_benchmark.cc index faff487a..565ad5dc 100644 --- a/mace/ops/space_to_batch_benchmark.cc +++ b/mace/ops/space_to_batch_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/space_to_depth.cc b/mace/ops/space_to_depth.cc deleted file mode 100644 index 67b520f6..00000000 --- a/mace/ops/space_to_depth.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/space_to_depth.h" - -namespace mace { -namespace ops { - -void Register_SpaceToDepth(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - SpaceToDepthOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - SpaceToDepthOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SpaceToDepth") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - SpaceToDepthOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/space_to_depth.h b/mace/ops/space_to_depth.h deleted file mode 100644 index 6d078e2f..00000000 --- a/mace/ops/space_to_depth.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_SPACE_TO_DEPTH_H_ -#define MACE_OPS_SPACE_TO_DEPTH_H_ - -#include -#include - -#include "mace/core/operator.h" -#include "mace/kernels/space_to_depth.h" - -namespace mace { -namespace ops { - -template -class SpaceToDepthOp : public Operator { - public: - SpaceToDepthOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), - functor_(context, - OperatorBase::GetOptionalArg("block_size", 1)) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - Tensor *output = this->Output(OUTPUT); - MACE_CHECK(input->dim_size() == 4, "input dim should be 4"); - return functor_(input, output, future); - } - - protected: - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); - - private: - kernels::SpaceToDepthOpFunctor functor_; -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_SPACE_TO_DEPTH_H_ diff --git a/mace/ops/space_to_depth_benchmark.cc b/mace/ops/space_to_depth_benchmark.cc index 97d3cb03..480a0421 100644 --- a/mace/ops/space_to_depth_benchmark.cc +++ b/mace/ops/space_to_depth_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/space_to_depth_test.cc b/mace/ops/space_to_depth_test.cc index c1168a65..76569492 100644 --- a/mace/ops/space_to_depth_test.cc +++ b/mace/ops/space_to_depth_test.cc @@ -15,7 +15,7 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/split.cc b/mace/ops/split.cc deleted file mode 100644 index e5e103d7..00000000 --- a/mace/ops/split.cc +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/split.h" - -namespace mace { -namespace ops { - -void Register_Split(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Split") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - SplitOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Split") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - SplitOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Split") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - SplitOp); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/split.h b/mace/ops/split.h deleted file mode 100644 index aa41aa15..00000000 --- a/mace/ops/split.h +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_SPLIT_H_ -#define MACE_OPS_SPLIT_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/split.h" - -namespace mace { -namespace ops { - -template -class SplitOp : public Operator { - public: - SplitOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), - functor_(context, OperatorBase::GetOptionalArg("axis", 3)) {} - - MaceStatus Run(StatsFuture *future) override { - MACE_CHECK(this->OutputSize() >= 2) - << "There must be at least two outputs for slicing"; - const Tensor *input = this->Input(INPUT); - const std::vector output_list = this->Outputs(); - const int32_t split_axis = OperatorBase::GetOptionalArg("axis", 3); - MACE_CHECK((input->dim(split_axis) % this->OutputSize()) == 0) - << "Outputs do not split input equally."; - - return functor_(input, output_list, future); - } - - private: - kernels::SplitFunctor functor_; - - private: - MACE_OP_INPUT_TAGS(INPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_SPLIT_H_ diff --git a/mace/ops/split_benchmark.cc b/mace/ops/split_benchmark.cc index 8dea1263..aa0e8fba 100644 --- a/mace/ops/split_benchmark.cc +++ b/mace/ops/split_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/split_test.cc b/mace/ops/split_test.cc index 57544d18..d42b3716 100644 --- a/mace/ops/split_test.cc +++ b/mace/ops/split_test.cc @@ -17,7 +17,6 @@ #include "gmock/gmock.h" #include "mace/ops/ops_test_util.h" -#include "mace/ops/split.h" namespace mace { namespace ops { diff --git a/mace/ops/sqrdiff_mean.cc b/mace/ops/sqrdiff_mean.cc deleted file mode 100644 index d8e8bd51..00000000 --- a/mace/ops/sqrdiff_mean.cc +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/sqrdiff_mean.h" - -namespace mace { -namespace ops { - -void Register_SqrDiffMean(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SqrDiffMean") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - SqrDiffMeanOp); -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SqrDiffMean") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - SqrDiffMeanOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("SqrDiffMean") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - SqrDiffMeanOp); -#endif -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/sqrdiff_mean.h b/mace/ops/sqrdiff_mean.h deleted file mode 100644 index f021c0b2..00000000 --- a/mace/ops/sqrdiff_mean.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_SQRDIFF_MEAN_H_ -#define MACE_OPS_SQRDIFF_MEAN_H_ - -#include -#include - -#include "mace/core/operator.h" -#include "mace/kernels/sqrdiff_mean.h" - -namespace mace { -namespace ops { - -template -class SqrDiffMeanOp : public Operator { - public: - SqrDiffMeanOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input0 = this->Input(INPUT0); - const Tensor *input1 = this->Input(INPUT1); - Tensor *output = this->Output(OUTPUT); - - return functor_(input0, input1, output, future); - } - - private: - kernels::SqrDiffMeanFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(INPUT0, INPUT1); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_SQRDIFF_MEAN_H_ diff --git a/mace/ops/squeeze.cc b/mace/ops/squeeze.cc deleted file mode 100644 index eac886dd..00000000 --- a/mace/ops/squeeze.cc +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/squeeze.h" - -namespace mace { -namespace ops { - -void Register_Squeeze(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Squeeze") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - SqueezeOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Squeeze") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - SqueezeOp); - -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Squeeze") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - SqueezeOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Squeeze") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - SqueezeOp); -#endif -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/squeeze_test.cc b/mace/ops/squeeze_test.cc index fba5a37d..166d9868 100644 --- a/mace/ops/squeeze_test.cc +++ b/mace/ops/squeeze_test.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "gmock/gmock.h" -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/stack.cc b/mace/ops/stack.cc deleted file mode 100644 index 7aa7c07e..00000000 --- a/mace/ops/stack.cc +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/stack.h" - -namespace mace { -namespace ops { - -void Register_Stack(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Stack") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - StackOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Stack") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - StackOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Stack") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - StackOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Stack") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - StackOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/stack.h b/mace/ops/stack.h deleted file mode 100644 index be25c0b0..00000000 --- a/mace/ops/stack.h +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_STACK_H_ -#define MACE_OPS_STACK_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/stack.h" - -namespace mace { -namespace ops { - -template -class StackOp : public Operator { - public: - StackOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context, OperatorBase::GetOptionalArg("axis", 0)) {} - - MaceStatus Run(StatsFuture *future) override { - const std::vector &inputs = this->Inputs(); - Tensor *output = this->Output(OUTPUT); - return functor_(inputs, output, future); - } - - private: - kernels::StackFunctor functor_; - - protected: - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_STACK_H_ diff --git a/mace/ops/stack_test.cc b/mace/ops/stack_test.cc index 8cccb133..e55ff278 100644 --- a/mace/ops/stack_test.cc +++ b/mace/ops/stack_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/strided_slice.cc b/mace/ops/strided_slice.cc deleted file mode 100644 index 0f608b17..00000000 --- a/mace/ops/strided_slice.cc +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "mace/ops/strided_slice.h" - -namespace mace { -namespace ops { - -void Register_StridedSlice(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("StridedSlice") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - StridedSliceOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("StridedSlice") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - StridedSliceOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("StridedSlice") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - StridedSliceOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("StridedSlice") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - StridedSliceOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/strided_slice.h b/mace/ops/strided_slice.h deleted file mode 100644 index 249dc3e9..00000000 --- a/mace/ops/strided_slice.h +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_STRIDED_SLICE_H_ -#define MACE_OPS_STRIDED_SLICE_H_ - -#include "mace/core/operator.h" -#include "mace/kernels/strided_slice.h" - -namespace mace { -namespace ops { - -template -class StridedSliceOp : public Operator { - public: - StridedSliceOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context, - OperatorBase::GetOptionalArg("begin_mask", 0), - OperatorBase::GetOptionalArg("end_mask", 0), - OperatorBase::GetOptionalArg("ellipsis_mask", 0), - OperatorBase::GetOptionalArg("new_axis_mask", 0), - OperatorBase::GetOptionalArg("shrink_axis_mask", 0), - OperatorBase::GetOptionalArg("slice", - false)) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - const Tensor *begin_indices = this->Input(BEGIN); - const Tensor *end_indices = this->Input(END); - const Tensor *strides = nullptr; - if (this->InputSize() > 3) { - strides = this->Input(STRIDES); - } - Tensor *output = this->Output(OUTPUT); - - return functor_(input, begin_indices, end_indices, strides, output, future); - } - - private: - kernels::StridedSliceFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(INPUT, BEGIN, END, STRIDES); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_STRIDED_SLICE_H_ diff --git a/mace/ops/strided_slice_test.cc b/mace/ops/strided_slice_test.cc index d975d7be..c13a813c 100644 --- a/mace/ops/strided_slice_test.cc +++ b/mace/ops/strided_slice_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/transpose.cc b/mace/ops/transpose.cc deleted file mode 100644 index 73dcaf7b..00000000 --- a/mace/ops/transpose.cc +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. 
All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/transpose.h" - -namespace mace { -namespace ops { - -void Register_Transpose(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Transpose") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - TransposeOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/transpose.h b/mace/ops/transpose.h deleted file mode 100644 index 91aa3365..00000000 --- a/mace/ops/transpose.h +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_TRANSPOSE_H_ -#define MACE_OPS_TRANSPOSE_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/softmax.h" -#include "mace/kernels/transpose.h" - -namespace mace { - -template -class TransposeOp : public Operator { - public: - TransposeOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - dims_(OperatorBase::GetRepeatedArgs("dims")), - functor_(context, dims_) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - Tensor *output = this->Output(OUTPUT); - const std::vector &input_shape = input->shape(); - MACE_CHECK((input_shape.size() == 4 && dims_.size() == 4) || - (input_shape.size() == 2 && dims_.size() == 2), - "rank should be 2 or 4"); - std::vector output_shape; - for (size_t i = 0; i < dims_.size(); ++i) { - output_shape.push_back(input_shape[dims_[i]]); - } - MACE_RETURN_IF_ERROR(output->Resize(output_shape)); - return functor_(input, output, future); - } - - protected: - std::vector dims_; - kernels::TransposeFunctor functor_; - - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace mace - -#endif // MACE_OPS_TRANSPOSE_H_ diff --git a/mace/ops/transpose_benchmark.cc b/mace/ops/transpose_benchmark.cc index 1e68a4a9..6d37b93c 100644 --- a/mace/ops/transpose_benchmark.cc +++ b/mace/ops/transpose_benchmark.cc @@ -15,7 +15,7 @@ #include #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/transpose_test.cc b/mace/ops/transpose_test.cc index 76bfc57a..44ef0ec2 100644 --- a/mace/ops/transpose_test.cc +++ b/mace/ops/transpose_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/unstack.cc b/mace/ops/unstack.cc deleted file mode 100644 index 7b1c815b..00000000 --- a/mace/ops/unstack.cc +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/unstack.h" - -namespace mace { -namespace ops { - -void Register_Unstack(OperatorRegistryBase *op_registry) { - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Unstack") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - UnstackOp); - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Unstack") - .Device(DeviceType::CPU) - .TypeConstraint("T") - .Build(), - UnstackOp); -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/unstack.h b/mace/ops/unstack.h deleted file mode 100644 index 1c3d1764..00000000 --- a/mace/ops/unstack.h +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_UNSTACK_H_ -#define MACE_OPS_UNSTACK_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/unstack.h" - -namespace mace { -namespace ops { - -template -class UnstackOp : public Operator { - public: - UnstackOp(const OperatorDef &operator_def, OpKernelContext *context) - : Operator(operator_def, context), - functor_(context, OperatorBase::GetOptionalArg("axis", 0)) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input = this->Input(INPUT); - const std::vector outputs = this->Outputs(); - return functor_(input, outputs, future); - } - - private: - kernels::UnstackFunctor functor_; - - protected: - MACE_OP_OUTPUT_TAGS(INPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_UNSTACK_H_ diff --git a/mace/ops/unstack_test.cc b/mace/ops/unstack_test.cc index 306c8362..4c9774ff 100644 --- a/mace/ops/unstack_test.cc +++ b/mace/ops/unstack_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/ops/ops_test_util.h" namespace mace { diff --git a/mace/ops/winograd_convolution_benchmark.cc b/mace/ops/winograd_convolution_benchmark.cc index c616a280..3b126f07 100644 --- a/mace/ops/winograd_convolution_benchmark.cc +++ b/mace/ops/winograd_convolution_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/kernels/conv_pool_2d_util.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/ops/winograd_convolution_test.cc b/mace/ops/winograd_convolution_test.cc index 3cd5ab92..1c82a189 100644 --- a/mace/ops/winograd_convolution_test.cc +++ b/mace/ops/winograd_convolution_test.cc @@ -14,7 +14,7 @@ #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/kernels/conv_pool_2d_util.h" #include "mace/ops/ops_test_util.h" @@ -22,7 +22,7 @@ namespace mace { namespace ops { namespace test { -class WinogradConvlutionTest : public OpsTestBase {}; +class WinogradConvolutionTest : public OpsTestBase {}; namespace { @@ -134,42 +134,42 @@ void WinogradConvolution(const index_t batch, } } // namespace -TEST_F(WinogradConvlutionTest, AlignedConvolutionM2) { +TEST_F(WinogradConvolutionTest, AlignedConvolutionM2) { WinogradConvolution(1, 32, 32, 3, 3, Padding::VALID, 2); WinogradConvolution(1, 32, 32, 3, 3, Padding::SAME, 2); } -TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM2) { +TEST_F(WinogradConvolutionTest, UnAlignedConvolutionM2) { WinogradConvolution(1, 61, 67, 31, 37, Padding::VALID, 2); WinogradConvolution(1, 61, 67, 37, 31, Padding::SAME, 2); } -TEST_F(WinogradConvlutionTest, BatchConvolutionM2) { +TEST_F(WinogradConvolutionTest, BatchConvolutionM2) { WinogradConvolution(3, 64, 64, 32, 32, Padding::VALID, 2); WinogradConvolution(5, 61, 67, 37, 31, Padding::SAME, 2); } -TEST_F(WinogradConvlutionTest, AlignedConvolutionM4) { +TEST_F(WinogradConvolutionTest, AlignedConvolutionM4) { WinogradConvolution(1, 32, 32, 3, 3, Padding::VALID, 4); WinogradConvolution(1, 32, 32, 3, 3, Padding::SAME, 4); } -TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM4) { +TEST_F(WinogradConvolutionTest, UnAlignedConvolutionM4) { WinogradConvolution(1, 61, 67, 31, 37, Padding::VALID, 4); WinogradConvolution(1, 61, 67, 37, 
31, Padding::SAME, 4); } -TEST_F(WinogradConvlutionTest, BatchConvolutionM4) { +TEST_F(WinogradConvolutionTest, BatchConvolutionM4) { WinogradConvolution(3, 64, 64, 32, 32, Padding::VALID, 4); WinogradConvolution(5, 61, 67, 37, 31, @@ -284,42 +284,42 @@ void WinogradConvolutionWithPad(const index_t batch, } } // namespace -TEST_F(WinogradConvlutionTest, AlignedConvolutionM2WithPad) { +TEST_F(WinogradConvolutionTest, AlignedConvolutionM2WithPad) { WinogradConvolutionWithPad(1, 32, 32, 32, 16, 1, 2); WinogradConvolutionWithPad(1, 32, 32, 32, 16, 2, 2); } -TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM2WithPad) { +TEST_F(WinogradConvolutionTest, UnAlignedConvolutionM2WithPad) { WinogradConvolutionWithPad(1, 61, 67, 31, 37, 1, 2); WinogradConvolutionWithPad(1, 61, 67, 37, 31, 2, 2); } -TEST_F(WinogradConvlutionTest, BatchConvolutionWithM2Pad) { +TEST_F(WinogradConvolutionTest, BatchConvolutionWithM2Pad) { WinogradConvolutionWithPad(3, 64, 64, 32, 32, 1, 2); WinogradConvolutionWithPad(5, 61, 67, 37, 31, 2, 2); } -TEST_F(WinogradConvlutionTest, AlignedConvolutionM4WithPad) { +TEST_F(WinogradConvolutionTest, AlignedConvolutionM4WithPad) { WinogradConvolutionWithPad(1, 32, 32, 32, 16, 1, 4); WinogradConvolutionWithPad(1, 32, 32, 32, 16, 2, 4); } -TEST_F(WinogradConvlutionTest, UnAlignedConvolutionM4WithPad) { +TEST_F(WinogradConvolutionTest, UnAlignedConvolutionM4WithPad) { WinogradConvolutionWithPad(1, 61, 67, 31, 37, 1, 4); WinogradConvolutionWithPad(1, 61, 67, 37, 31, 2, 4); } -TEST_F(WinogradConvlutionTest, BatchConvolutionWithM4Pad) { +TEST_F(WinogradConvolutionTest, BatchConvolutionWithM4Pad) { WinogradConvolutionWithPad(3, 64, 64, 32, 32, 1, 4); WinogradConvolutionWithPad(5, 61, 67, 37, 31, diff --git a/mace/ops/winograd_inverse_transform.cc b/mace/ops/winograd_inverse_transform.cc deleted file mode 100644 index 62e86248..00000000 --- a/mace/ops/winograd_inverse_transform.cc +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/winograd_inverse_transform.h" - -namespace mace { -namespace ops { - -void Register_WinogradInverseTransform(OperatorRegistryBase *op_registry) { -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - WinogradInverseTransformOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradInverseTransform") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - WinogradInverseTransformOp); -#else - MACE_UNUSED(op_registry); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/winograd_inverse_transform.h b/mace/ops/winograd_inverse_transform.h deleted file mode 100644 index 548c889a..00000000 --- a/mace/ops/winograd_inverse_transform.h +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MACE_OPS_WINOGRAD_INVERSE_TRANSFORM_H_ -#define MACE_OPS_WINOGRAD_INVERSE_TRANSFORM_H_ - -#include -#include -#include - -#include "mace/core/operator.h" -#include "mace/kernels/activation.h" -#include "mace/kernels/winograd_transform.h" - -namespace mace { -namespace ops { - -template -class WinogradInverseTransformOp : public Operator { - public: - WinogradInverseTransformOp(const OperatorDef &op_def, - OpKernelContext *context) - : Operator(op_def, context), - functor_(context, - kernels::StringToActivationType( - OperatorBase::GetOptionalArg("activation", - "NOOP")), - OperatorBase::GetOptionalArg("max_limit", 0.0f), - OperatorBase::GetOptionalArg("wino_block_size", 2)) {} - - MaceStatus Run(StatsFuture *future) override { - const std::vector &inputs = this->Inputs(); - Tensor *output_tensor = this->Output(OUTPUT); - return functor_(inputs, output_tensor, future); - } - - private: - kernels::WinogradInverseTransformFunctor functor_; - - protected: - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_WINOGRAD_INVERSE_TRANSFORM_H_ diff --git a/mace/ops/winograd_transform.cc b/mace/ops/winograd_transform.cc deleted file mode 100644 index a4dab0ec..00000000 --- a/mace/ops/winograd_transform.cc +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "mace/ops/winograd_transform.h" - -namespace mace { -namespace ops { - -void Register_WinogradTransform(OperatorRegistryBase *op_registry) { -#ifdef MACE_ENABLE_OPENCL - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - WinogradTransformOp); - - MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("WinogradTransform") - .Device(DeviceType::GPU) - .TypeConstraint("T") - .Build(), - WinogradTransformOp); -#else - MACE_UNUSED(op_registry); -#endif // MACE_ENABLE_OPENCL -} - -} // namespace ops -} // namespace mace diff --git a/mace/ops/winograd_transform.h b/mace/ops/winograd_transform.h deleted file mode 100644 index 2274b6e8..00000000 --- a/mace/ops/winograd_transform.h +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright 2018 Xiaomi, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef MACE_OPS_WINOGRAD_TRANSFORM_H_ -#define MACE_OPS_WINOGRAD_TRANSFORM_H_ - -#include - -#include "mace/core/operator.h" -#include "mace/kernels/winograd_transform.h" - -namespace mace { -namespace ops { - -template -class WinogradTransformOp : public Operator { - public: - WinogradTransformOp(const OperatorDef &op_def, OpKernelContext *context) - : Operator(op_def, context), - functor_(context, - static_cast(OperatorBase::GetOptionalArg( - "padding", static_cast(VALID))), - OperatorBase::GetRepeatedArgs("padding_values"), - OperatorBase::GetOptionalArg( - "wino_block_size", 2)) {} - - MaceStatus Run(StatsFuture *future) override { - const Tensor *input_tensor = this->Input(INPUT); - Tensor *output_tensor = this->Output(OUTPUT); - - return functor_(input_tensor, output_tensor, future); - } - - private: - kernels::WinogradTransformFunctor functor_; - - protected: - MACE_OP_INPUT_TAGS(INPUT); - MACE_OP_OUTPUT_TAGS(OUTPUT); -}; - -} // namespace ops -} // namespace mace - -#endif // MACE_OPS_WINOGRAD_TRANSFORM_H_ diff --git a/mace/ops/winograd_transform_benchmark.cc b/mace/ops/winograd_transform_benchmark.cc index 9955c9ab..5c21c9ad 100644 --- a/mace/ops/winograd_transform_benchmark.cc +++ b/mace/ops/winograd_transform_benchmark.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/core/testing/test_benchmark.h" #include "mace/ops/ops_test_util.h" diff --git a/mace/proto/mace.proto b/mace/proto/mace.proto index 11f2a88d..d4094dd9 100644 --- a/mace/proto/mace.proto +++ b/mace/proto/mace.proto @@ -72,10 +72,11 @@ message OperatorDef { repeated string output = 2; optional string name = 3; optional string type = 4; - repeated Argument arg = 5; - repeated OutputShape output_shape = 6; - repeated DataType output_type = 7; - repeated QuantizeActivationInfo quantize_info = 8; + optional int32 device_type = 5; + repeated Argument arg = 6; + repeated OutputShape output_shape = 7; + repeated DataType output_type = 8; + repeated QuantizeActivationInfo quantize_info = 9; repeated int32 mem_id = 10; @@ -119,6 +120,7 @@ message NetDef { repeated OperatorDef op = 1; repeated Argument arg = 2; repeated ConstTensor tensors = 3; + repeated string op_types = 4; // for mem optimization optional MemoryArena mem_arena = 10; diff --git a/mace/public/mace.h b/mace/public/mace.h index 313e1afb..e9ab737a 100644 --- a/mace/public/mace.h +++ b/mace/public/mace.h @@ -82,17 +82,42 @@ class RunMetadata { const char *MaceVersion(); -enum MaceStatus { - MACE_SUCCESS = 0, - MACE_INVALID_ARGS = 1, - MACE_OUT_OF_RESOURCES = 2 +class MaceStatus { + public: + enum Code { + MACE_SUCCESS = 0, + MACE_INVALID_ARGS = 1, + MACE_OUT_OF_RESOURCES = 2 + }; + + public: + MaceStatus(); + MaceStatus(const Code code); // NOLINT(runtime/explicit) + MaceStatus(const Code code, const std::string &information); + MaceStatus(const MaceStatus &); + MaceStatus(MaceStatus &&); + MaceStatus &operator=(const MaceStatus &); + MaceStatus &operator=(const MaceStatus &&); + ~MaceStatus(); + Code code() const; + std::string information() const; + + bool operator==(const MaceStatus &other) const; + bool operator!=(const MaceStatus &other) const; + + private: + class Impl; + std::unique_ptr impl_; }; -#define 
MACE_RETURN_IF_ERROR(stmt) \ + +#define MACE_RETURN_IF_ERROR(stmt) \ { \ MaceStatus status = (stmt); \ - if (status != MACE_SUCCESS) { \ - VLOG(0) << "Mace runtime failure: " << __FILE__ << ":" << __LINE__; \ + if (status != MaceStatus::MACE_SUCCESS) { \ + VLOG(0) << "Mace runtime failure: " \ + << __FILE__ << ":" << __LINE__ << ". " \ + << status.information(); \ return status; \ } \ } @@ -112,9 +137,9 @@ class MACE_API GPUContextBuilder { GPUContextBuilder(); ~GPUContextBuilder(); GPUContextBuilder(const GPUContextBuilder &) = delete; - GPUContextBuilder(const GPUContextBuilder &&) = delete; + GPUContextBuilder(GPUContextBuilder &&) = delete; GPUContextBuilder &operator=(const GPUContextBuilder &) = delete; - GPUContextBuilder &operator=(const GPUContextBuilder &&) = delete; + GPUContextBuilder &operator=(GPUContextBuilder &&) = delete; /// \brief Set internal storage factory to store internal data. /// @@ -167,7 +192,7 @@ class MACE_API MaceEngineConfig { /// /// Just use one GPUContext for multiple models run on GPU. /// \param context created use GPUContextBuilder - /// \return MACE_SUCCESS for success, other for failed. + /// \return MaceStatus::MACE_SUCCESS for success, other for failed. MaceStatus SetGPUContext(std::shared_ptr context); /// \brief Set GPU hints, currently only supports Adreno GPU. @@ -177,7 +202,7 @@ class MACE_API MaceEngineConfig { /// /// \param perf_hint performance hint /// \param priority_hint priority hint - /// \return MACE_SUCCESS for success, other for failed. + /// \return MaceStatus::MACE_SUCCESS for success, other for failed. MaceStatus SetGPUHints(GPUPerfHint perf_hint, GPUPriorityHint priority_hint); @@ -199,7 +224,7 @@ class MACE_API MaceEngineConfig { /// detect big-LITTLE cores (see GetBigLittleCoreIDs). In such cases, it's /// suggested to use AFFINITY_NONE to use all cores. /// \param use_gemmlowp use gemmlowp for quantized inference - /// \return MACE_SUCCESS for success, other for failed. 
+ /// \return MaceStatus::MACE_SUCCESS for success, other for failed. MaceStatus SetCPUThreadPolicy(int num_threads_hint, CPUAffinityPolicy policy, bool use_gemmlowp = false); @@ -273,8 +298,9 @@ class MACE_API MaceEngine { /// \param output_nodes[in]: the array of output nodes' name /// \param config[in]: configurations for MaceEngine. /// \param engine[out]: output MaceEngine object -/// \return MACE_SUCCESS for success, MACE_INVALID_ARGS for wrong arguments, -/// MACE_OUT_OF_RESOURCES for resources is out of range. +/// \return MaceStatus::MACE_SUCCESS for success, +/// MaceStatus::MACE_INVALID_ARGS for wrong arguments, +/// MaceStatus::MACE_OUT_OF_RESOURCES for resources is out of range. MACE_API MaceStatus CreateMaceEngineFromProto( const std::vector &model_pb, const std::string &model_data_file, diff --git a/mace/python/tools/converter.py b/mace/python/tools/converter.py index 4fee7dfa..b0992de0 100644 --- a/mace/python/tools/converter.py +++ b/mace/python/tools/converter.py @@ -199,6 +199,9 @@ def main(unused_args): for arg in cpu_graph_def.arg: if arg.name not in output_graph_arg_names: output_graph_def.arg.extend(arg) + for op_type in cpu_graph_def.op_types: + if op_type not in output_graph_def.op_types: + output_graph_def.op_types.extend([op_type]) print("Merge done") else: option.device = device_type_map[FLAGS.runtime] diff --git a/mace/python/tools/converter_tool/base_converter.py b/mace/python/tools/converter_tool/base_converter.py index 5e6c6f8e..3afd65c1 100644 --- a/mace/python/tools/converter_tool/base_converter.py +++ b/mace/python/tools/converter_tool/base_converter.py @@ -93,7 +93,6 @@ MaceSupportedOps = [ 'Dequantize', 'Eltwise', 'ExpandDims', - 'FoldedBatchNorm', 'Fill', 'FullyConnected', 'Gather', diff --git a/mace/python/tools/converter_tool/caffe_converter.py b/mace/python/tools/converter_tool/caffe_converter.py index 5aabfa4b..374d1073 100644 --- a/mace/python/tools/converter_tool/caffe_converter.py +++ 
b/mace/python/tools/converter_tool/caffe_converter.py @@ -487,7 +487,7 @@ class CaffeConverter(base_converter.ConverterInterface): def convert_folded_batchnorm(self, caffe_op): op = self.convert_general_op(caffe_op) - op.type = MaceOp.FoldedBatchNorm.name + op.type = MaceOp.BatchNorm.name scale_op = None for consumer in self._caffe_net.get_consumers(caffe_op.layer.top[0]): diff --git a/mace/python/tools/converter_tool/shape_inference.py b/mace/python/tools/converter_tool/shape_inference.py index 5320c804..e62affaf 100644 --- a/mace/python/tools/converter_tool/shape_inference.py +++ b/mace/python/tools/converter_tool/shape_inference.py @@ -37,7 +37,7 @@ class ShapeInference(object): MaceOp.Deconv2D.name: self.infer_shape_deconv, MaceOp.DepthwiseConv2d.name: self.infer_shape_conv_pool_shape, MaceOp.Eltwise.name: self.infer_shape_general, - MaceOp.FoldedBatchNorm.name: self.infer_shape_general, + MaceOp.BatchNorm.name: self.infer_shape_general, MaceOp.AddN.name: self.infer_shape_general, MaceOp.Activation.name: self.infer_shape_general, MaceOp.Pooling.name: self.infer_shape_conv_pool_shape, diff --git a/mace/python/tools/converter_tool/tensorflow_converter.py b/mace/python/tools/converter_tool/tensorflow_converter.py index 56f2c3a0..68e5ccb5 100644 --- a/mace/python/tools/converter_tool/tensorflow_converter.py +++ b/mace/python/tools/converter_tool/tensorflow_converter.py @@ -531,7 +531,7 @@ class TensorflowConverter(base_converter.ConverterInterface): def convert_fused_batchnorm(self, tf_op): op = self.convert_general_op(tf_op) - op.type = MaceOp.FoldedBatchNorm.name + op.type = MaceOp.BatchNorm.name is_training = tf_op.get_attr(tf_is_training_str) assert is_training is False, 'Only support batch normalization ' \ diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py index dc7fe58e..7175e6e7 100644 --- a/mace/python/tools/converter_tool/transformer.py +++ b/mace/python/tools/converter_tool/transformer.py @@ 
-343,7 +343,7 @@ class Transformer(base_converter.ConverterInterface): and consumer_op.input[1] in self._consts \ and len(self._consts[consumer_op.input[1]].dims) == 1: print("Fold batchnorm: %s(%s)" % (op.name, op.type)) - consumer_op.type = MaceOp.FoldedBatchNorm.name + consumer_op.type = MaceOp.BatchNorm.name consumer_op.input[:] = [op.input[0], op.input[1], consumer_op.input[1]] @@ -534,7 +534,7 @@ class Transformer(base_converter.ConverterInterface): if (op.type == MaceOp.Conv2D.name) \ and self.consumer_count(op.output[0]) == 1: consumer_op = self._consumers[op.output[0]][0] - if consumer_op.type == MaceOp.FoldedBatchNorm.name: + if consumer_op.type == MaceOp.BatchNorm.name: print("Fold conv and bn: %s(%s)" % (op.name, op.type)) filter = self._consts[op.input[1]] scale = self._consts[consumer_op.input[1]] @@ -574,7 +574,7 @@ class Transformer(base_converter.ConverterInterface): if (op.type == MaceOp.Deconv2D.name) \ and self.consumer_count(op.output[0]) == 1: consumer_op = self._consumers[op.output[0]][0] - if consumer_op.type == MaceOp.FoldedBatchNorm.name: + if consumer_op.type == MaceOp.BatchNorm.name: print("Fold deconv and bn: %s(%s)" % (op.name, op.type)) filter = self._consts[op.input[1]] scale = self._consts[consumer_op.input[1]] @@ -617,7 +617,7 @@ class Transformer(base_converter.ConverterInterface): if op.type == MaceOp.DepthwiseConv2d.name \ and self.consumer_count(op.output[0]) == 1: consumer_op = self._consumers[op.output[0]][0] - if consumer_op.type == MaceOp.FoldedBatchNorm.name: + if consumer_op.type == MaceOp.BatchNorm.name: print("Fold depthwise conv and bn: %s(%s)" % (op.name, op.type)) filter = self._consts[op.input[1]] @@ -977,7 +977,7 @@ class Transformer(base_converter.ConverterInterface): or op.type == MaceOp.Deconv2D.name or op.type == MaceOp.DepthwiseConv2d.name or op.type == MaceOp.FullyConnected.name - or op.type == MaceOp.FoldedBatchNorm.name + or op.type == MaceOp.BatchNorm.name or op.type == 
MaceOp.WinogradInverseTransform.name) \ and len(self._consumers.get(op.output[0], [])) == 1: consumer_op = self._consumers[op.output[0]][0] @@ -1433,7 +1433,7 @@ class Transformer(base_converter.ConverterInterface): if op.input[1] in self._consts \ and len(self._consts[op.input[1]].dims) == 1: self.buffer_transform(op, 1, OpenCLBufferType.ARGUMENT) - elif op.type == MaceOp.FoldedBatchNorm.name: + elif op.type == MaceOp.BatchNorm.name: self.buffer_transform(op, 1, OpenCLBufferType.ARGUMENT) self.buffer_transform(op, 2, OpenCLBufferType.ARGUMENT) if len(op.input) >= 4: @@ -1695,6 +1695,14 @@ class Transformer(base_converter.ConverterInterface): ConverterUtil.add_data_type_arg(op_def, mace_pb2.DT_FLOAT) + def add_op_types(self): + net = self._model + op_types = set() + for op in net.op: + op_types.add(op.type) + for op_type in op_types: + net.op_types.extend([op_type]) + def sort_by_execution(self): print("Sort by execution") net = self._model @@ -1711,6 +1719,8 @@ class Transformer(base_converter.ConverterInterface): del net.op[:] net.op.extend(sorted_nodes) + self.add_op_types() + print("Final ops:") for op in net.op: print("%s (%s): %s" % (op.name, op.type, [ diff --git a/mace/python/tools/mace_engine_factory.h.jinja2 b/mace/python/tools/mace_engine_factory.h.jinja2 index ab400151..3e183f15 100644 --- a/mace/python/tools/mace_engine_factory.h.jinja2 +++ b/mace/python/tools/mace_engine_factory.h.jinja2 @@ -60,7 +60,7 @@ std::map model_name_map { /// \param output_nodes[in]: the array of output nodes' name /// \param config[in]: configurations for MaceEngine. /// \param engine[out]: output MaceEngine object -/// \return MACE_SUCCESS for success, MACE_INVALID_ARGS for wrong arguments, +/// \return MaceStatus::MACE_SUCCESS for success, MACE_INVALID_ARGS for wrong arguments, /// MACE_OUT_OF_RESOURCES for resources is out of range. 
MaceStatus CreateMaceEngineFromCode( const std::string &model_name, diff --git a/mace/python/tools/model.jinja2 b/mace/python/tools/model.jinja2 index 3f4ba1c4..ec1ba284 100644 --- a/mace/python/tools/model.jinja2 +++ b/mace/python/tools/model.jinja2 @@ -122,6 +122,12 @@ void CreateTensors(NetDef *net_def) { {% endfor %} } +void CreateOpTypes(NetDef *net_def) { + {% for op_type in net.op_types %} + net_def->add_op_types({{ op_type|tojson }}); + {% endfor %} +} + {% if net.mem_arena.mem_block|length != 0 %} void CreateMemoryArena(mace::MemoryArena *mem_arena) { mem_arena->mutable_mem_block()->Reserve({{ net.mem_arena.mem_block|length }}); @@ -162,6 +168,9 @@ const std::shared_ptr CreateNet() { {% if net.output_info | length > 0 %} CreateOutputInfo(net_def.get()); {% endif %} + {% if net.op_types|length > 0 %} + CreateOpTypes(net_def.get()); + {% endif %} return net_def; } diff --git a/mace/test/mace_api_mt_test.cc b/mace/test/mace_api_mt_test.cc index 7f768adc..0f8d1f49 100644 --- a/mace/test/mace_api_mt_test.cc +++ b/mace/test/mace_api_mt_test.cc @@ -15,7 +15,7 @@ #include #include // NOLINT(build/c++11) -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/kernels/conv_pool_2d_util.h" #include "mace/ops/ops_test_util.h" @@ -332,6 +332,10 @@ void MaceRunFunc(const int in_out_size) { OutputInfo *info = net_def->add_output_info(); info->set_name(output_names[i]); } + for (int i = 0; i < net_def->op_size(); ++i) { + net_def->add_op_types(net_def->op(i).type()); + } + MaceEngineConfig config(DeviceType::GPU); MaceEngine engine(config); diff --git a/mace/test/mace_api_test.cc b/mace/test/mace_api_test.cc index 945758b9..54dd99b7 100644 --- a/mace/test/mace_api_test.cc +++ b/mace/test/mace_api_test.cc @@ -15,7 +15,7 @@ #include -#include "mace/core/operator.h" +#include "mace/core/op_def_registry.h" #include "mace/kernels/conv_pool_2d_util.h" #include "mace/ops/ops_test_util.h" #include "mace/public/mace.h" @@ -334,6 +334,10 @@ void 
MaceRun(const int in_out_size, info->set_name(output_names[i]); } + for (int i = 0; i < net_def->op_size(); ++i) { + net_def->add_op_types(net_def->op(i).type()); + } + MaceEngineConfig config(DeviceType::GPU); MaceEngine engine(config); diff --git a/mace/tools/validation/mace_run.cc b/mace/tools/validation/mace_run.cc index 79b4c571..08ffdebe 100644 --- a/mace/tools/validation/mace_run.cc +++ b/mace/tools/validation/mace_run.cc @@ -215,7 +215,7 @@ bool RunModel(const std::string &model_name, FLAGS_omp_num_threads, static_cast(FLAGS_cpu_affinity_policy), true); - if (status != MACE_SUCCESS) { + if (status != MaceStatus::MACE_SUCCESS) { LOG(WARNING) << "Set openmp or cpu affinity failed."; } #ifdef MACE_ENABLE_OPENCL @@ -274,9 +274,9 @@ bool RunModel(const std::string &model_name, #endif int64_t t1 = NowMicros(); - if (create_engine_status != MACE_SUCCESS) { + if (create_engine_status != MaceStatus::MACE_SUCCESS) { LOG(ERROR) << "Create engine runtime error, retry ... errcode: " - << create_engine_status; + << create_engine_status.information(); } else { init_millis = (t1 - t0) / 1000.0; LOG(INFO) << "Total init latency: " << init_millis << " ms"; @@ -324,9 +324,9 @@ bool RunModel(const std::string &model_name, while (true) { int64_t t3 = NowMicros(); MaceStatus warmup_status = engine->Run(inputs, &outputs); - if (warmup_status != MACE_SUCCESS) { + if (warmup_status != MaceStatus::MACE_SUCCESS) { LOG(ERROR) << "Warmup runtime error, retry ... 
errcode: " - << warmup_status; + << warmup_status.information(); do { #ifdef MODEL_GRAPH_FORMAT_CODE create_engine_status = @@ -345,7 +345,7 @@ bool RunModel(const std::string &model_name, config, &engine); #endif - } while (create_engine_status != MACE_SUCCESS); + } while (create_engine_status != MaceStatus::MACE_SUCCESS); } else { int64_t t4 = NowMicros(); warmup_millis = (t4 - t3) / 1000.0; @@ -364,9 +364,9 @@ bool RunModel(const std::string &model_name, while (true) { int64_t t0 = NowMicros(); run_status = engine->Run(inputs, &outputs); - if (run_status != MACE_SUCCESS) { + if (run_status != MaceStatus::MACE_SUCCESS) { LOG(ERROR) << "Mace run model runtime error, retry ... errcode: " - << run_status; + << run_status.information(); do { #ifdef MODEL_GRAPH_FORMAT_CODE create_engine_status = @@ -385,7 +385,7 @@ bool RunModel(const std::string &model_name, config, &engine); #endif - } while (create_engine_status != MACE_SUCCESS); + } while (create_engine_status != MaceStatus::MACE_SUCCESS); } else { int64_t t1 = NowMicros(); total_run_duration += (t1 - t0); diff --git a/mace/utils/BUILD b/mace/utils/BUILD index 283efa49..6d6feb1a 100644 --- a/mace/utils/BUILD +++ b/mace/utils/BUILD @@ -13,6 +13,7 @@ cc_library( name = "utils", srcs = [ "logging.cc", + "status.cc", "string_util.cc", ], hdrs = glob([ diff --git a/mace/utils/status.cc b/mace/utils/status.cc new file mode 100644 index 00000000..fd8dd9da --- /dev/null +++ b/mace/utils/status.cc @@ -0,0 +1,88 @@ +// Copyright 2018 Xiaomi, Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "mace/public/mace.h" + +namespace mace { + +class MaceStatus::Impl { + public: + explicit Impl(const Code code): code_(code), information_("") {} + Impl(const Code code, const std::string &information) + : code_(code), information_(information) {} + ~Impl() = default; + + void SetCode(const Code code) { code_ = code; } + Code code() const { return code_; } + void SetInformation(const std::string &info) { information_ = info; } + std::string information() const { return Code2Str() + ": " + information_; } + + private: + std::string Code2Str() const { + switch (code_) { + case MaceStatus::MACE_SUCCESS: + return "Success"; + case MaceStatus::MACE_INVALID_ARGS: + return "Invalid Arguments"; + case MaceStatus::MACE_OUT_OF_RESOURCES: + return "Out of resources"; + default: + return ""; + } + } + + private: + MaceStatus::Code code_; + std::string information_; +}; + +MaceStatus::MaceStatus() + : impl_(new MaceStatus::Impl(MaceStatus::MACE_SUCCESS)) {} +MaceStatus::MaceStatus(const Code code) : impl_(new MaceStatus::Impl(code)) {} +MaceStatus::MaceStatus(const Code code, const std::string &information) + : impl_(new MaceStatus::Impl(code, information)) {} +MaceStatus::MaceStatus(const MaceStatus &other) + : impl_(new MaceStatus::Impl(other.code(), other.information())) {} +MaceStatus::MaceStatus(MaceStatus &&other) + : impl_(new MaceStatus::Impl(other.code(), other.information())) {} +MaceStatus::~MaceStatus() = default; + +MaceStatus& MaceStatus::operator=(const MaceStatus &other) { + impl_->SetCode(other.code()); + impl_->SetInformation(other.information()); + return *this; +} +MaceStatus& MaceStatus::operator=(const MaceStatus &&other) { + impl_->SetCode(other.code()); + impl_->SetInformation(other.information()); + return *this; +} + +MaceStatus::Code MaceStatus::code() const { + return impl_->code(); +} + +std::string MaceStatus::information() const { + 
return impl_->information(); +} + +bool MaceStatus::operator==(const MaceStatus &other) const { + return other.code() == impl_->code(); +} + +bool MaceStatus::operator!=(const MaceStatus &other) const { + return other.code() != impl_->code(); +} + +} // namespace mace diff --git a/mace/utils/utils.h b/mace/utils/utils.h index 12138cad..237febcc 100644 --- a/mace/utils/utils.h +++ b/mace/utils/utils.h @@ -33,8 +33,8 @@ namespace mace { CLASSNAME &operator=(const CLASSNAME &) = delete #endif -#ifndef MACE_VIRTUAL_EMPTY_DESTRUCTOR -#define MACE_VIRTUAL_EMPTY_DESTRUCTOR(CLASSNAME) \ +#ifndef MACE_EMPTY_VIRTUAL_DESTRUCTOR +#define MACE_EMPTY_VIRTUAL_DESTRUCTOR(CLASSNAME) \ public: \ virtual ~CLASSNAME() {} #endif -- GitLab