diff --git a/mace/core/operator.cc b/mace/core/operator.cc
index 826f424bfde34a85657bfb55f49b46ce9b2a0822..c670d9aa729dc575a204eacf3789fd56675df4a9 100644
--- a/mace/core/operator.cc
+++ b/mace/core/operator.cc
@@ -83,6 +83,7 @@ extern void Register_WinogradInverseTransform(OperatorRegistry *op_registry);
 extern void Register_Reshape(OperatorRegistry *op_registry);
 extern void Register_Eltwise(OperatorRegistry *op_registry);
 extern void Register_FullyConnected(OperatorRegistry *op_registry);
+extern void Register_Slice(OperatorRegistry *op_registry);
 
 OperatorRegistry::OperatorRegistry() {
   Register_Activation(this);
@@ -109,6 +110,7 @@ OperatorRegistry::OperatorRegistry() {
   Register_Reshape(this);
   Register_Eltwise(this);
   Register_FullyConnected(this);
+  Register_Slice(this);
 }
 
 }  // namespace mace
diff --git a/mace/kernels/opencl/cl/slice.cl b/mace/kernels/opencl/cl/slice.cl
new file mode 100644
index 0000000000000000000000000000000000000000..d1d4a55427e6fd0a969a406a0dfc1603ea305e82
--- /dev/null
+++ b/mace/kernels/opencl/cl/slice.cl
@@ -0,0 +1,13 @@
+#include <common.h>
+
+__kernel void slice(__read_only image2d_t input,
+                    __private const int chan_blk_offset,
+                    __write_only image2d_t output) {
+  const int chan_blk_idx = get_global_id(0);
+  const int width_idx = get_global_id(1);
+  const int width = get_global_size(1);
+  const int hb_idx = get_global_id(2);
+  DATA_TYPE4 data = READ_IMAGET(input, SAMPLER,
+                                (int2)(mad24(chan_blk_idx + chan_blk_offset, width, width_idx), hb_idx));
+  WRITE_IMAGET(output, (int2)(mad24(chan_blk_idx, width, width_idx), hb_idx), data);
+}
diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc
index 9cd508bdfde411d2a55c97712d00c2066b0f82c6..119ec7cd61a99f915ff7ec29443839ea2923d3a4 100644
--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -72,8 +72,6 @@ static void ConcatN(cl::Kernel *kernel,
   const index_t width = output->dim(2);
   const index_t channel = output->dim(3);
 
-  const int channel_blk = RoundUpDiv4(channel);
-
   if (kernel->get() == nullptr) {
     auto runtime = OpenCLRuntime::Global();
     std::set<std::string> built_options;
diff --git a/mace/kernels/opencl/slice.cc b/mace/kernels/opencl/slice.cc
new file mode 100644
index 0000000000000000000000000000000000000000..15201f13253c9f2636f54466c4ae91c65cc6f7b3
--- /dev/null
+++ b/mace/kernels/opencl/slice.cc
@@ -0,0 +1,70 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/kernels/slice.h"
+#include "mace/core/runtime/opencl/opencl_runtime.h"
+#include "mace/kernels/opencl/helper.h"
+#include "mace/utils/tuner.h"
+
+namespace mace {
+namespace kernels {
+
+template<typename T>
+void SliceFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
+                                                     const std::vector<Tensor *> &output_list,
+                                                     StatsFuture *future) {
+  const index_t input_channels = input->dim(3);
+  const size_t outputs_count = output_list.size();
+  const index_t output_channels = input_channels / outputs_count;
+  MACE_CHECK(output_channels % 4 == 0) << "output channels of slice op must be divisible by 4";
+  std::vector<index_t> output_shape({input->dim(0), input->dim(1),
+                                     input->dim(2), output_channels});
+
+  std::vector<size_t> image_shape;
+  CalImage2DShape(output_shape, BufferType::IN_OUT_CHANNEL, image_shape);
+  for (size_t i= 0; i < outputs_count; ++i) {
+    output_list[i]->ResizeImage(output_shape, image_shape);
+  }
+
+  if (kernel_.get() == nullptr) {
+    auto runtime = OpenCLRuntime::Global();
+    std::set<std::string> built_options;
+    std::string kernel_name = MACE_OBFUSCATE_SYMBOL("slice");
+    built_options.emplace("-Dslice=" + kernel_name);
+    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum<T>::value));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum<T>::value));
+    kernel_ = runtime->BuildKernel("slice", kernel_name, built_options);
+  }
+  const index_t channel_blk = RoundUpDiv4(output_channels);
+
+  const uint32_t gws[3] = {
+      static_cast<uint32_t>(channel_blk),
+      static_cast<uint32_t>(input->dim(2)),
+      static_cast<uint32_t>(input->dim(0) * input->dim(1)),
+  };
+  const std::vector<uint32_t> lws = {8, 16, 8, 1};
+  std::stringstream ss;
+  ss << "slice_opencl_kernel_"
+     << input->dim(0) << "_"
+     << input->dim(1) << "_"
+     << input->dim(2) << "_"
+     << input_channels << "_"
+     << outputs_count;
+  for (int i = 0; i < outputs_count; ++i) {
+    uint32_t idx = 0;
+    kernel_.setArg(idx++, *(static_cast<const cl::Image2D *>(input->opencl_image())));
+    kernel_.setArg(idx++, static_cast<int32_t>(channel_blk * i));
+    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(output_list[i]->opencl_image())));
+
+    TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
+  }
+}
+
+template
+struct SliceFunctor<DeviceType::OPENCL, float>;
+template
+struct SliceFunctor<DeviceType::OPENCL, half>;
+
+}  // namespace kernels
+}  // namespace mace
diff --git a/mace/kernels/slice.h b/mace/kernels/slice.h
new file mode 100644
index 0000000000000000000000000000000000000000..619e28ab2d7f03aae196e1328b4fc195a4bc505c
--- /dev/null
+++ b/mace/kernels/slice.h
@@ -0,0 +1,69 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#ifndef MACE_KERNELS_SLICE_H_
+#define MACE_KERNELS_SLICE_H_
+
+#include "mace/core/future.h"
+#include "mace/core/runtime/opencl/cl2_header.h"
+#include "mace/core/tensor.h"
+#include "mace/core/types.h"
+#include "mace/public/mace.h"
+
+namespace mace {
+namespace kernels {
+
+template<DeviceType D, typename T>
+struct SliceFunctor {
+
+  void operator()(const Tensor *input,
+                  const std::vector<Tensor *> &output_list,
+                  StatsFuture *future) {
+    const index_t outer_size = input->dim(0) * input->dim(1) * input->dim(2);
+    const index_t input_channels = input->dim(3);
+    const size_t outputs_count = output_list.size();
+    const index_t output_channels = input_channels / outputs_count;
+    std::vector<T *> output_ptrs(output_list.size(), nullptr);
+
+    std::vector<index_t> output_shape({input->dim(0), input->dim(1),
+                                       input->dim(2), output_channels});
+
+    for (size_t i= 0; i < outputs_count; ++i) {
+      output_list[i]->Resize(output_shape);
+      output_ptrs[i] = output_list[i]->mutable_data<T>();
+    }
+    const T *input_ptr = input->data<T>();
+
+#pragma omp parallel for
+    for (int outer_idx = 0; outer_idx < outer_size; ++outer_idx) {
+      int input_idx = outer_idx * input_channels;
+      int output_idx = outer_idx * output_channels;
+      for (size_t i = 0; i < outputs_count; ++i) {
+        if (DataTypeCanUseMemcpy(DataTypeToEnum<T>::v())) {
+          memcpy(output_ptrs[i]+output_idx, input_ptr+input_idx, output_channels * sizeof(T));
+        } else {
+          for (index_t k = 0; k < output_channels; ++k) {
+            *(output_ptrs[i] + output_idx + k) = *(input_ptr + input_idx + k);
+          }
+        }
+        input_idx += output_channels;
+      }
+    }
+  }
+};
+
+template<typename T>
+struct SliceFunctor<DeviceType::OPENCL, T> {
+
+  void operator()(const Tensor *input,
+                  const std::vector<Tensor *> &output_list,
+                  StatsFuture *future);
+  cl::Kernel kernel_;
+
+};
+
+}  // namepsace kernels
+}  // namespace mace
+
+#endif  // MACE_KERNELS_SLICE_H_
diff --git a/mace/ops/concat.cc b/mace/ops/concat.cc
index 71be2fc3eb1100c36a23907aeccbec41c2dba899..361fce51cf0ce7ecfb60da65e5b17791a6c4067d 100644
--- a/mace/ops/concat.cc
+++ b/mace/ops/concat.cc
@@ -12,11 +12,6 @@ void Register_Concat(OperatorRegistry *op_registry) {
                                      .TypeConstraint<float>("T")
                                      .Build(),
                     ConcatOp<DeviceType::CPU, float>);
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
-                                     .Device(DeviceType::CPU)
-                                     .TypeConstraint<half>("T")
-                                     .Build(),
-                    ConcatOp<DeviceType::CPU, half>);
 
   REGISTER_OPERATOR(op_registry, OpKeyBuilder("Concat")
                                      .Device(DeviceType::OPENCL)
diff --git a/mace/ops/slice.cc b/mace/ops/slice.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6de3da403fca90031c76597c31126f742bf8ba5f
--- /dev/null
+++ b/mace/ops/slice.cc
@@ -0,0 +1,28 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/ops/slice.h"
+
+namespace mace {
+
+void Register_Slice(OperatorRegistry *op_registry) {
+  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice")
+                                     .Device(DeviceType::CPU)
+                                     .TypeConstraint<float>("T")
+                                     .Build(),
+                    SliceOp<DeviceType::CPU, float>);
+
+  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice")
+                                     .Device(DeviceType::OPENCL)
+                                     .TypeConstraint<float>("T")
+                                     .Build(),
+                    SliceOp<DeviceType::OPENCL, float>);
+  REGISTER_OPERATOR(op_registry, OpKeyBuilder("Slice")
+                                     .Device(DeviceType::OPENCL)
+                                     .TypeConstraint<half>("T")
+                                     .Build(),
+                    SliceOp<DeviceType::OPENCL, half>);
+}
+
+}  // namespace mace
diff --git a/mace/ops/slice.h b/mace/ops/slice.h
new file mode 100644
index 0000000000000000000000000000000000000000..41106a6339bc21ff9ce4b176bb02cdb12b1299c8
--- /dev/null
+++ b/mace/ops/slice.h
@@ -0,0 +1,37 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#ifndef MACE_OPS_SLICE_H_
+#define MACE_OPS_SLICE_H_
+
+#include "mace/core/operator.h"
+#include "mace/kernels/slice.h"
+namespace mace {
+
+template <DeviceType D, typename T>
+class SliceOp : public Operator<D, T> {
+ public:
+  SliceOp(const OperatorDef &op_def, Workspace *ws)
+      : Operator<D, T>(op_def, ws) {}
+
+  bool Run(StatsFuture *future) override {
+    MACE_CHECK(this->OutputSize() >= 2) << "There must be at least two outputs for slicing";
+    const Tensor *input = this->Input(INPUT);
+    const std::vector<Tensor *> output_list = this->Outputs();
+    MACE_CHECK((input->dim(3) % this->OutputSize()) == 0) << "Outputs do not split input equally.";
+
+    functor_(input, output_list, future);
+    return true;
+  }
+
+ private:
+  kernels::SliceFunctor<D, T> functor_;
+
+ private:
+  OP_INPUT_TAGS(INPUT);
+};
+
+}  // namespace mace
+
+#endif  // MACE_OPS_SLICE_H_
diff --git a/mace/ops/slice_benchmark.cc b/mace/ops/slice_benchmark.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5273f592c71084434b265affd4ef5760cf6aef37
--- /dev/null
+++ b/mace/ops/slice_benchmark.cc
@@ -0,0 +1,79 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/core/operator.h"
+#include "mace/core/testing/test_benchmark.h"
+#include "mace/ops/ops_test_util.h"
+
+namespace mace {
+template<DeviceType D, typename T>
+static void BMSliceHelper(int iters,
+                          const std::vector<index_t> &input_shape,
+                          const index_t num_outputs) {
+  mace::testing::StopTiming();
+
+  // Construct graph
+  OpsTestNet net;
+
+  const index_t input_size = std::accumulate(input_shape.begin(), input_shape.end(), 1, std::multiplies<index_t>());
+  std::vector<float> input_data(input_size);
+  GenerateRandomRealTypeData(input_shape, input_data);
+  net.AddInputFromArray<D, float>("Input", input_shape, input_data);
+
+  if (D == DeviceType::OPENCL) {
+    BufferToImage<D, T>(net, "Input", "InputImage",
+                        kernels::BufferType::IN_OUT_CHANNEL);
+
+    auto builder = OpDefBuilder("Slice", "SliceTest");
+    builder.Input("InputImage");
+    for (int i = 0; i < num_outputs; ++i) {
+      builder = builder.Output(MakeString("OutputImage", i));
+    }
+    builder
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .Finalize(net.NewOperatorDef());
+  } else {
+    auto builder = OpDefBuilder("Slice", "SliceTest");
+    builder.Input("Input");
+    for (int i = 0; i < num_outputs; ++i) {
+      builder = builder.Output(MakeString("Output", i));
+    }
+    builder.Finalize(net.NewOperatorDef());
+  }
+
+  // Warm-up
+  for (int i = 0; i < 2; ++i) {
+    net.RunOp(D);
+    net.Sync();
+  }
+
+  mace::testing::StartTiming();
+  while (iters--) {
+    net.RunOp(D);
+    net.Sync();
+  }
+}
+
+#define BM_SLICE_MACRO(N, H, W, C, NO, TYPE, DEVICE) \
+  static void BM_SLICE_##N##_##H##_##W##_##C##_##NO##_##TYPE##_##DEVICE(int iters) { \
+    const int64_t tot = static_cast<int64_t>(iters) * N * H * W * C;                 \
+    mace::testing::MaccProcessed(tot);                                               \
+    mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                              \
+    BMSliceHelper<DEVICE, TYPE>(iters, {N, H, W, C}, NO);                            \
+  }                                                                                  \
+  BENCHMARK(BM_SLICE_##N##_##H##_##W##_##C##_##NO##_##TYPE##_##DEVICE)
+
+#define BM_SLICE(N, H, W, C, NO)                 \
+  BM_SLICE_MACRO(N, H, W, C, NO, float, CPU);    \
+  BM_SLICE_MACRO(N, H, W, C, NO, float, OPENCL); \
+  BM_SLICE_MACRO(N, H, W, C, NO, half, OPENCL);
+
+BM_SLICE(1, 32, 32, 32, 2);
+BM_SLICE(1, 32, 32, 128, 2);
+BM_SLICE(1, 32, 32, 256, 2);
+BM_SLICE(1, 128, 128, 32, 2);
+BM_SLICE(1, 128, 128, 128, 2);
+
+
+}  // namespace mace
diff --git a/mace/ops/slice_test.cc b/mace/ops/slice_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ff65fffa78ff2e8507944e53e100fbc6b31df506
--- /dev/null
+++ b/mace/ops/slice_test.cc
@@ -0,0 +1,99 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/ops/slice.h"
+#include "mace/ops/ops_test_util.h"
+#include "gmock/gmock.h"
+
+using namespace mace;
+
+class SliceOpTest : public OpsTestBase {};
+
+template<DeviceType D, typename T>
+void RandomTest(const int num_outputs) {
+  srand(time(nullptr));
+  const index_t output_channels = 4 * (1 + rand() % 10);
+  const index_t input_channels = num_outputs * output_channels;
+  const index_t batch = 3 + (rand() % 10);
+  const index_t height = 13 + (rand() % 10);
+  const index_t width = 17 + (rand() % 10);
+
+  // Construct graph
+  OpsTestNet net;
+
+  std::vector<index_t> input_shape({batch, height, width, input_channels});
+  const index_t input_size = std::accumulate(input_shape.begin(), input_shape.end(), 1, std::multiplies<index_t>());
+  std::vector<float> input_data(input_size);
+  GenerateRandomRealTypeData(input_shape, input_data);
+  net.AddInputFromArray<D, float>("Input", input_shape, input_data);
+
+  if (D == DeviceType::OPENCL) {
+    BufferToImage<D, T>(net, "Input", "InputImage",
+                        kernels::BufferType::IN_OUT_CHANNEL);
+
+    auto builder = OpDefBuilder("Slice", "SliceTest");
+    builder.Input("InputImage");
+    for (int i = 0; i < num_outputs; ++i) {
+      builder = builder.Output(MakeString("OutputImage", i));
+    }
+    builder
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .Finalize(net.NewOperatorDef());
+  } else {
+    auto builder = OpDefBuilder("Slice", "SliceTest");
+    builder.Input("Input");
+    for (int i = 0; i < num_outputs; ++i) {
+      builder = builder.Output(MakeString("Output", i));
+    }
+    builder.Finalize(net.NewOperatorDef());
+
+  }
+
+  // Run
+  net.RunOp(D);
+
+  if (D == DeviceType::OPENCL) {
+    for (int i = 0; i < num_outputs; ++i) {
+      ImageToBuffer<D, float>(net, MakeString("OutputImage", i), MakeString("Output", i),
+                              kernels::BufferType::IN_OUT_CHANNEL);
+    }
+  }
+
+  // Check
+  std::vector<index_t> expected_shape({batch, height, width, output_channels});
+  const index_t outer_size = std::accumulate(expected_shape.begin(), expected_shape.end() - 1,
+                                             1, std::multiplies<index_t>());
+  const float *input_ptr = input_data.data();
+  const float *output_ptr;
+  for (int i = 0; i < num_outputs; ++i) {
+    auto output = net.GetOutput(MakeString("Output", i).c_str());
+    EXPECT_THAT(output->shape(), ::testing::ContainerEq(expected_shape));
+    Tensor::MappingGuard output_mapper(output);
+    output_ptr = output->data<float>();
+    for (int outer_idx = 0; outer_idx < outer_size; ++outer_idx) {
+      const int idx = outer_idx * input_channels + i * output_channels;
+      for (int j = 0; j < output_channels; ++j) {
+        ASSERT_NEAR(*output_ptr++, input_ptr[idx + j], 1e-2) << "with output " << i << " index " << idx + j;
+      }
+    }
+  }
+}
+
+TEST_F(SliceOpTest, CPU) {
+  RandomTest<DeviceType::CPU, float>(2);
+  RandomTest<DeviceType::CPU, float>(4);
+  RandomTest<DeviceType::CPU, float>(11);
+}
+
+TEST_F(SliceOpTest, OPENCLFloat) {
+  RandomTest<DeviceType::OPENCL, float>(2);
+  RandomTest<DeviceType::OPENCL, float>(4);
+  RandomTest<DeviceType::OPENCL, float>(11);
+}
+
+TEST_F(SliceOpTest, OPENCLHalf) {
+  RandomTest<DeviceType::OPENCL, half>(2);
+  RandomTest<DeviceType::OPENCL, half>(4);
+  RandomTest<DeviceType::OPENCL, half>(11);
+}