diff --git a/mace/core/net.cc b/mace/core/net.cc
index 7912a6d4209808c25b7b33b47806f3eedf81112b..2aeb951e92fe8a44cb814caf6ab13eaf5c6bae7c 100644
--- a/mace/core/net.cc
+++ b/mace/core/net.cc
@@ -14,6 +14,7 @@
 #include 
 #include 
+#include <set>
 #include 
 #include 
 
@@ -63,18 +64,85 @@ bool TransformRequiredOp(const std::string &op_type) {
 }
 #endif  // MACE_ENABLE_OPENCL
 
+
+
+// TODO(lichao): Move to runtime driver class after universality done.
+// fallback to gpu buffer when kernels are implemented
+void FindAvailableDevicesForOp(const OpRegistryBase &op_registry,
+                               const OperatorDef &op,
+                               const std::unordered_map<std::string,
+                                   std::vector<index_t>> &tensor_shape_info,
+                               std::set<DeviceType>
+                                   *available_devices) {
+  auto devices = op_registry.AvailableDevices(op.type());
+  available_devices->insert(devices.begin(), devices.end());
+  std::string op_type = op.type();
+  // For those whose shape is not 4-rank but can run on GPU
+  if (op_type == "BufferTransform"
+      || op_type == "LSTMCell"
+      || op_type == "FullyConnected"
+      || op_type == "Softmax"
+      || op_type == "Squeeze") {
+    return;
+  } else {
+    if (op.output_shape_size() != op.output_size()) {
+      return;
+    }
+    if (op.output_shape(0).dims_size() != 4) {
+      available_devices->erase(DeviceType::GPU);
+    }
+
+    if (op_type == "Split") {
+      if (op.output_shape(0).dims_size() != 4
+          || op.output_shape(0).dims()[3] % 4 != 0) {
+        available_devices->erase(DeviceType::GPU);
+      }
+    } else if (op_type == "Concat") {
+      if (op.output_shape(0).dims_size() != 4) {
+        available_devices->erase(DeviceType::GPU);
+      } else {
+        if (op.input_size() != 2) {
+          for (const std::string &input : op.input()) {
+            if (tensor_shape_info.find(input) != tensor_shape_info.end()) {
+              auto &input_shape = tensor_shape_info.at(input);
+              if (input_shape[3] % 4 != 0) {
+                available_devices->erase(DeviceType::GPU);
+                break;
+              }
+            }
+          }
+        }
+      }
+    } else if (op_type == "ChannelShuffle") {
+      int groups = ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
+          op, "group", 1);
+      int channels = op.output_shape(0).dims(3);
+      int channels_per_group = channels / groups;
+      if (groups % 4 != 0 || channels_per_group % 4 != 0) {
+        available_devices->erase(DeviceType::GPU);
+      }
+    }
+  }
+}
+
 }  // namespace
 
 std::unique_ptr<Operation> SerialNet::CreateOperation(
     const OpRegistryBase *op_registry,
     OpConstructContext *construct_context,
     std::shared_ptr<OperatorDef> op_def,
+    const std::unordered_map<std::string,
+        std::vector<index_t>> tensor_shape_info,
     DataFormat data_format_flag,
     bool is_quantize_model) {
   // Create the Operation
   DeviceType target_device_type = target_device_->device_type();
   // Get available devices
-  auto available_devices = op_registry->AvailableDevices(op_def->type());
+  std::set<DeviceType> available_devices;
+  FindAvailableDevicesForOp(*op_registry,
+                            *op_def,
+                            tensor_shape_info,
+                            &available_devices);
   // Find the device type to run the op.
   // If the target_device_type in available devices, use target_device_type,
   // otherwise, fallback to CPU device.
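The filter above only prunes DeviceType::GPU from the candidate set; the existing selection code below still falls back to CPU when GPU is no longer available. A minimal standalone sketch of the core shape rule, assuming a simplified op description (FakeOp and GpuIsUsable are hypothetical names, not MACE APIs):

#include <cstdint>
#include <string>
#include <vector>

// Illustrative stand-in for an operator description.
struct FakeOp {
  std::string type;
  std::vector<std::vector<int64_t>> output_shapes;  // one shape per output
};

// Mirrors the rule above: keep GPU only when the first output is 4-D (NHWC),
// and for Split additionally require the channel dim to be a multiple of 4.
bool GpuIsUsable(const FakeOp &op) {
  if (op.output_shapes.empty()) return true;  // unknown shapes: leave GPU in
  const std::vector<int64_t> &shape = op.output_shapes[0];
  if (shape.size() != 4) return false;
  if (op.type == "Split" && shape[3] % 4 != 0) return false;
  return true;
}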
@@ -93,6 +161,7 @@ std::unique_ptr<Operation> SerialNet::CreateOperation(
     }
   }
   op_def->set_device_type(device_type);
+
   // transpose output shape if run on CPU (default format is NHWC)
   if (!is_quantize_model && device_type == DeviceType::CPU &&
       op_def->output_shape_size() == op_def->output_size()) {
@@ -139,7 +208,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
   MemoryType target_mem_type;
   // quantize model flag
   bool is_quantize_model = IsQuantizedModel(*net_def);
-  //
+
   DataFormat data_format_flag = NHWC;
   if (target_device_->device_type() == DeviceType::CPU) {
     target_mem_type = MemoryType::CPU_BUFFER;
@@ -163,6 +232,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
           target_mem_type, DataType::DT_FLOAT, input_shape, -1));
     }
   }
+
 #ifdef MACE_ENABLE_OPENCL
   else {  // GPU  NOLINT[readability/braces]
     target_mem_type = MemoryType::GPU_BUFFER;
@@ -176,6 +246,22 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
   }
 #endif  // MACE_ENABLE_OPENCL
 
+  std::unordered_map<std::string, std::vector<index_t>> tensor_shape_info;
+  for (auto &op : net_def->op()) {
+    if (op.output_size() != op.output_shape_size()) {
+      continue;
+    }
+    for (int i = 0; i < op.output_size(); ++i) {
+      tensor_shape_info[op.output(i)] =
+          std::move(std::vector<index_t>(op.output_shape(i).dims().begin(),
+                                         op.output_shape(i).dims().end()));
+    }
+  }
+  for (auto &tensor : net_def->tensors()) {
+    tensor_shape_info[tensor.name()] =
+        std::move(std::vector<index_t>(tensor.dims().begin(),
+                                       tensor.dims().end()));
+  }
   OpConstructContext construct_context(ws_);
   for (int idx = 0; idx < net_def->op_size(); ++idx) {
     std::shared_ptr<OperatorDef> op_def(new OperatorDef(net_def->op(idx)));
@@ -183,6 +269,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
     auto op = CreateOperation(op_registry,
                               &construct_context,
                               op_def,
+                              tensor_shape_info,
                               data_format_flag,
                               is_quantize_model);
 #ifdef MACE_ENABLE_OPENCL
@@ -211,7 +298,8 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
         auto &output_info = output_map.at(op_def->input(i));
         // check whether the tensor has been transformed
         if (transformed_set.count(t_input_name) == 0) {
-          VLOG(1) << "Add Transform operation to transform tensor '"
+          VLOG(1) << "Add Transform operation " << op_def->name()
+                  << " to transform tensor '"
                   << op_def->input(i) << "', from memory type "
                   << output_info.mem_type << " to "
                   << wanted_in_mem_type
@@ -234,6 +322,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
               op_registry,
               &construct_context,
               transform_op_def,
+              tensor_shape_info,
              data_format_flag);
           operators_.emplace_back(std::move(transform_op));
           transformed_set.insert(t_input_name);
@@ -321,6 +410,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
           op_registry,
           &construct_context,
           transform_op_def,
+          tensor_shape_info,
           output_data_format);
       operators_.emplace_back(std::move(transform_op));
       // where to do graph reference count.
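The constructor now builds tensor_shape_info once, from operator output shapes and constant tensors, and threads it through every CreateOperation call, including the inserted transform ops. A minimal sketch of building and querying such a name-to-shape map, assuming a hypothetical FakeTensorDef record in place of the protobuf types:

#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

// Illustrative stand-in for a tensor definition.
struct FakeTensorDef {
  std::string name;
  std::vector<int64_t> dims;
};

int main() {
  std::vector<FakeTensorDef> defs = {
      {"conv1_out", {1, 224, 224, 32}},
      {"fc_weight", {1024, 32}},
  };
  // Build the lookup once, keyed by tensor name.
  std::unordered_map<std::string, std::vector<int64_t>> tensor_shape_info;
  for (const auto &def : defs) {
    tensor_shape_info[def.name] = def.dims;
  }
  // A consumer (e.g. the Concat alignment check above) looks shapes up by name.
  auto it = tensor_shape_info.find("conv1_out");
  if (it != tensor_shape_info.end() && it->second.size() == 4) {
    std::cout << "channels: " << it->second[3]
              << (it->second[3] % 4 == 0 ? " (4-aligned)" : " (not 4-aligned)")
              << std::endl;
  }
  return 0;
}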
diff --git a/mace/core/net.h b/mace/core/net.h
index 10577a572f5a0629ae515d9b330befbaa639016e..5362d9ee4b8630a894cbb5705d6503bce2ed85f2 100644
--- a/mace/core/net.h
+++ b/mace/core/net.h
@@ -59,6 +59,8 @@ class SerialNet : public NetBase {
       const OpRegistryBase *op_registry,
       OpConstructContext *construct_context,
       std::shared_ptr<OperatorDef> op_def,
+      const std::unordered_map<std::string,
+          std::vector<index_t>> tensor_shape_info,
       DataFormat input_format,
       bool is_quantize_model = false);
 
diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc
index 43950a9db00a76ba84b35cb519c4c0c30ded6263..40ab5839ea41d29b610f4872df63bfd47c0d644f 100644
--- a/mace/core/workspace.cc
+++ b/mace/core/workspace.cc
@@ -131,7 +131,8 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
     }
 
     std::unique_ptr<Tensor> tensor(
-        new Tensor(device->allocator(), dst_data_type, true));
+        new Tensor(device->allocator(), dst_data_type, true,
+                   const_tensor.name()));
     tensor->Resize(dims);
 
     MACE_CHECK(tensor->size() == const_tensor.data_size(),
@@ -328,26 +329,52 @@ void Workspace::RemoveUnusedBuffer() {
 void Workspace::RemoveAndReloadBuffer(const NetDef &net_def,
                                       const unsigned char *model_data,
                                       Allocator *alloc) {
+  std::unordered_set<std::string> tensor_to_host;
+  for (auto &op : net_def.op()) {
+    if (op.device_type() == DeviceType::CPU) {
+      for (std::string input : op.input()) {
+        tensor_to_host.insert(input);
+      }
+    }
+  }
   for (auto &const_tensor : net_def.tensors()) {
     auto iter = tensor_map_.find(const_tensor.name());
     if (iter->second->unused()) {
       tensor_map_.erase(iter);
-    } else if (!diffused_buffer_) {
-      tensor_map_.erase(iter);
+    } else {
       std::vector<index_t> dims;
       for (const index_t d : const_tensor.dims()) {
         dims.push_back(d);
       }
-      std::unique_ptr<Tensor> tensor(
-          new Tensor(alloc, const_tensor.data_type()));
-      tensor->Resize(dims);
-      MACE_CHECK(tensor->size() == const_tensor.data_size(),
-                 "Tensor's data_size not equal with the shape");
-      tensor->CopyBytes(model_data + const_tensor.offset(),
-                        const_tensor.data_size() *
-                        GetEnumTypeSize(const_tensor.data_type()));
-
-      tensor_map_[const_tensor.name()] = std::move(tensor);
+
+      if (tensor_to_host.find(const_tensor.name()) != tensor_to_host.end()
+          && const_tensor.data_type() == DataType::DT_HALF) {
+        std::unique_ptr<Tensor> tensor(
+            new Tensor(alloc, DataType::DT_FLOAT,
+                       true, const_tensor.name()));
+        tensor->Resize(dims);
+        MACE_CHECK(tensor->size() == const_tensor.data_size(),
+                   "Tensor's data_size not equal with the shape");
+        Tensor::MappingGuard guard(tensor.get());
+        float *dst_data = tensor->mutable_data<float>();
+        const half *org_data = reinterpret_cast<const half *>(
+            model_data + const_tensor.offset());
+        for (index_t i = 0; i < const_tensor.data_size(); ++i) {
+          dst_data[i] = half_float::half_cast<float>(org_data[i]);
+        }
+        tensor_map_[const_tensor.name()] = std::move(tensor);
+      } else if (!diffused_buffer_) {
+        std::unique_ptr<Tensor> tensor(
+            new Tensor(alloc, const_tensor.data_type(),
+                       true, const_tensor.name()));
+        tensor->Resize(dims);
+        MACE_CHECK(tensor->size() == const_tensor.data_size(),
+                   "Tensor's data_size not equal with the shape");
+        tensor->CopyBytes(model_data + const_tensor.offset(),
+                          const_tensor.data_size() *
+                          GetEnumTypeSize(const_tensor.data_type()));
+        tensor_map_[const_tensor.name()] = std::move(tensor);
+      }
     }
   }
   tensor_buffer_.reset(nullptr);
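RemoveAndReloadBuffer now collects the inputs of CPU ops into tensor_to_host and rebuilds any DT_HALF constant in that set as a DT_FLOAT tensor, converting element by element with half_float::half_cast<float>. A minimal sketch of that widening step in isolation, assuming the single-header half.hpp library already used by the diff is on the include path (WidenHalfToFloat is a hypothetical helper):

#include <cstddef>
#include <vector>

#include "half.hpp"  // provides half_float::half and half_float::half_cast

// Copies 'count' fp16 values into a new fp32 buffer, element by element,
// the same conversion the patch performs into the mapped DT_FLOAT tensor.
std::vector<float> WidenHalfToFloat(const half_float::half *src, size_t count) {
  std::vector<float> dst(count);
  for (size_t i = 0; i < count; ++i) {
    dst[i] = half_float::half_cast<float>(src[i]);
  }
  return dst;
}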
diff --git a/mace/libmace/mace.cc b/mace/libmace/mace.cc
index 42959ca877e42002fb567138fb528ca11c086e6f..047cdf8e9e8d68d7254eb917bbfda3513db6015c 100644
--- a/mace/libmace/mace.cc
+++ b/mace/libmace/mace.cc
@@ -482,14 +482,14 @@ MaceStatus MaceEngine::Impl::Init(
   MACE_RETURN_IF_ERROR(ws_->PreallocateOutputTensor(*net_def,
                                                     &mem_optimizer,
                                                     device_.get()));
-
+  if (device_type_ == DeviceType::GPU) {
+    ws_->RemoveAndReloadBuffer(*net_def, model_data, device_->allocator());
+  }
   MACE_RETURN_IF_ERROR(net_->Init());
 #ifdef MACE_ENABLE_HEXAGON
   }
 #endif
-  if (device_type_ == DeviceType::GPU) {
-    ws_->RemoveAndReloadBuffer(*net_def, model_data, device_->allocator());
-  }
+
   return MaceStatus::MACE_SUCCESS;
 }
 
diff --git a/mace/ops/channel_shuffle.cc b/mace/ops/channel_shuffle.cc
index d4404c618d0a06c75892782fab7bcd48866e5ebc..8301ccb54681bcf5fc1e521ec603ede8fc2d205f 100644
--- a/mace/ops/channel_shuffle.cc
+++ b/mace/ops/channel_shuffle.cc
@@ -58,14 +58,12 @@ class ChannelShuffleOp : public Operation {
 #pragma omp parallel for collapse(2) schedule(runtime)
     for (index_t b = 0; b < batch; ++b) {
       for (index_t c = 0; c < channels; ++c) {
-        const T *input_base = input_ptr + b * batch_size;
-        T *output_base = output_ptr + b * batch_size;
         index_t g = c % groups_;
         index_t idx = c / groups_;
-        for (index_t hw = 0; hw < height * width; ++hw) {
-          output_base[c * image_size + hw] = input_base[
-              (g * channels_per_group + idx) * image_size + hw];
-        }
+        const T *in_ptr = input_ptr + b * batch_size
+            + (g * channels_per_group + idx) * image_size;
+        T *out_ptr = output_ptr + b * batch_size + c * image_size;
+        memcpy(out_ptr, in_ptr, image_size * sizeof(T));
       }
     }
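Because the CPU operator works on NCHW data, output channel c is a contiguous block of image_size elements taken from input channel (c % groups) * channels_per_group + (c / groups), which is what lets the inner element loop collapse into one memcpy per channel. A standalone sketch of that mapping (ShuffleChannelsNCHW is a hypothetical helper, not MACE's operator):

#include <cstring>

// Copies each output channel of an NCHW tensor from its shuffled source
// channel in one memcpy, matching the index math in the patched loop.
void ShuffleChannelsNCHW(const float *input, float *output,
                         int batch, int channels, int image_size, int groups) {
  const int channels_per_group = channels / groups;
  const int batch_size = channels * image_size;
  for (int b = 0; b < batch; ++b) {
    for (int c = 0; c < channels; ++c) {
      const int g = c % groups;    // source group
      const int idx = c / groups;  // position within the source group
      const float *in = input + b * batch_size
          + (g * channels_per_group + idx) * image_size;
      float *out = output + b * batch_size + c * image_size;
      std::memcpy(out, in, image_size * sizeof(float));
    }
  }
}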