diff --git a/mace/core/net.cc b/mace/core/net.cc
index 279724f6e791623923e8772b5db88a4bb8293413..1732cfe1a36f04b9fed6c378e67b4637554113ae 100644
--- a/mace/core/net.cc
+++ b/mace/core/net.cc
@@ -12,9 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <utility>
 #include <algorithm>
 #include <limits>
+#include <unordered_set>
+#include <utility>
 
 #include "mace/core/future.h"
 #include "mace/core/macros.h"
@@ -53,6 +54,13 @@ std::string TransformedName(const std::string &input_name,
   ss << input_name << "_mem_type_" << mem_type;
   return ss.str();
 }
+
+bool TransformRequiredOp(const std::string &op_type) {
+  // Op types named here are exempt from input Transform insertion.
+  static const std::unordered_set<std::string> kExemptOps{
+      "Shape", "InferConv2dShape"};
+  return kExemptOps.find(op_type) == kExemptOps.end();
+}
 #endif  // MACE_ENABLE_OPENCL
 
 }  // namespace
@@ -72,6 +80,7 @@ std::unique_ptr<Operation> SerialNet::CreateOperation(
   // otherwise, fallback to CPU device.
   DeviceType device_type = DeviceType::CPU;
   construct_context->set_device(cpu_device_);
+  construct_context->set_operator_def(op_def);
   construct_context->set_output_mem_type(MemoryType::CPU_BUFFER);
   for (auto device : available_devices) {
     if (device == target_device_type) {
@@ -103,7 +112,6 @@ std::unique_ptr<Operation> SerialNet::CreateOperation(
       }
     }
   }
-  construct_context->set_operator_def(op_def);
   std::unique_ptr<Operation> op(
       op_registry->CreateOperation(construct_context, device_type));
   return std::move(op);
@@ -126,7 +134,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
   std::unordered_map<std::string, InternalOutputInfo> output_map;
   // used for memory optimization
   std::unordered_map<std::string, MemoryType> output_mem_map;
-  std::unordered_map<std::string, std::string> transformed_map;
+  std::unordered_set<std::string> transformed_set;
   // add input information
   MemoryType target_mem_type;
   // quantize model flag
@@ -180,71 +188,80 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
 #ifdef MACE_ENABLE_OPENCL
     // Add input transform operation if necessary
     if (target_device_->device_type() == DeviceType::GPU) {
-      const DataType dt =
-          static_cast<DataType>(
-              ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
-                  *op_def, "T", static_cast<int>(DataType::DT_FLOAT)));
       // the outputs' memory type of the operation
       MemoryType out_mem_type = construct_context.output_mem_type();
       int input_size = op_def->input_size();
-      for (int i = 0; i < input_size; ++i) {
-        if (output_map.count(op_def->input(i)) == 1) {
-          // if op is memory-reuse op, no transformation
-          if (MemoryOptimizer::IsMemoryReuseOp(op_def->type())) {
-            out_mem_type = output_map.at(op_def->input(i)).mem_type;
-            break;
-          }
-          // check whether is the output tensor of other operation
-          if (output_map.at(op_def->input(i)).mem_type != out_mem_type ||
-              output_map.at(op_def->input(i)).dtype != dt) {
-            auto key = TransformedName(op_def->input(i), out_mem_type);
-            auto &output_info = output_map.at(op_def->input(i));
-            // check whether the tensor has been transformed
-            if (transformed_map.count(key) == 0) {
-              VLOG(1) << "Add Transform operation to transform tensor '"
-                      << op_def->input(i) << "', from memory type "
-                      << output_info.mem_type << " to " << out_mem_type
-                      << ", from Data Type " << output_info.dtype << " to "
-                      << dt;
-              std::string input_name = op_def->input(i);
-              std::string t_input_name =
-                  TransformedName(input_name,
-                                  out_mem_type);
-              op_def->set_input(i, t_input_name);
-              auto input_shape = output_info.shape;
-              if (output_info.mem_type == MemoryType::CPU_BUFFER &&
-                  input_shape.size() == 4) {
-                // NCHW -> NHWC
-                input_shape =
-                    TransposeShape<index_t, index_t>(input_shape,
-                                                     {0, 2, 3, 1});
+      // skip input transformation for ops TransformRequiredOp() rejects
+      if (TransformRequiredOp(op_def->type())) {
+        for (int i = 0; i < input_size; ++i) {
+          if (output_map.count(op_def->input(i)) == 1) {
+            // if op is memory-reuse op, no transformation
+            if (MemoryOptimizer::IsMemoryReuseOp(op_def->type())) {
+              out_mem_type = output_map.at(op_def->input(i)).mem_type;
+              break;
+            }
+            // check whether to do transform
+            MemoryType wanted_in_mem_type =
+                construct_context.GetInputMemType(i);
+            DataType wanted_in_dt = construct_context.GetInputDataType(i);
+            if (output_map.at(op_def->input(i)).mem_type != wanted_in_mem_type
+                || output_map.at(op_def->input(i)).dtype != wanted_in_dt) {
+              auto t_input_name = TransformedName(op_def->input(i),
+                                                  wanted_in_mem_type);
+              auto &output_info = output_map.at(op_def->input(i));
+              // check whether the tensor has been transformed
+              if (transformed_set.count(t_input_name) == 0) {
+                VLOG(1) << "Add Transform operation to transform tensor '"
+                        << op_def->input(i) << "', from memory type "
+                        << output_info.mem_type << " to "
+                        << wanted_in_mem_type
+                        << ", from Data Type " << output_info.dtype << " to "
+                        << wanted_in_dt;
+                std::string input_name = op_def->input(i);
+                op_def->set_input(i, t_input_name);
+                auto input_shape = output_info.shape;
+                if (output_info.mem_type == MemoryType::CPU_BUFFER &&
+                    input_shape.size() == 4) {
+                  // NCHW -> NHWC
+                  input_shape =
+                      TransposeShape<index_t, index_t>(input_shape,
+                                                       {0, 2, 3, 1});
+                }
+                auto transform_op_def = OpenCLUtil::CreateTransformOpDef(
+                    input_name, input_shape, t_input_name,
+                    wanted_in_dt, wanted_in_mem_type);
+                auto transform_op = CreateOperation(
+                    op_registry,
+                    &construct_context,
+                    transform_op_def,
+                    data_format_flag);
+                operators_.emplace_back(std::move(transform_op));
+                transformed_set.insert(t_input_name);
+                output_mem_map[t_input_name] = wanted_in_mem_type;
+                // where to do graph reference count.
+                mem_optimizer->UpdateTensorRef(transform_op_def.get());
+              } else {
+                op_def->set_input(i, t_input_name);
               }
-              auto transform_op_def = OpenCLUtil::CreateTransformOpDef(
-                  input_name, input_shape, t_input_name,
-                  dt, out_mem_type);
-              auto transform_op = CreateOperation(
-                  op_registry,
-                  &construct_context,
-                  transform_op_def,
-                  data_format_flag);
-              operators_.emplace_back(std::move(transform_op));
-              transformed_map.emplace(key, t_input_name);
-              output_mem_map[t_input_name] = out_mem_type;
-              // where to do graph reference count.
-              mem_optimizer->UpdateTensorRef(transform_op_def.get());
-            } else {
-              op_def->set_input(i, transformed_map[key]);
             }
+          } else {
+            MACE_CHECK(ws_->GetTensor(op_def->input(i)) != nullptr
+                           && ws_->GetTensor(op_def->input(i))->is_weight(),
+                       "Tensor ", op_def->input(i), " of ",
+                       op_def->name(), " not allocated");
           }
-        } else {
-          MACE_CHECK(ws_->GetTensor(op_def->input(i)) != nullptr
-                         && ws_->GetTensor(op_def->input(i))->is_weight(),
-                     "Tensor ", op_def->input(i), " of ",
-                     op_def->name(), " not allocated");
         }
       }
       // update the map : output_tensor -> Operation
       for (int out_idx = 0; out_idx < op_def->output_size(); ++out_idx) {
+        DataType dt;
+        if (op_def->output_type_size() == op_def->output_size()) {
+          dt = op_def->output_type(out_idx);
+        } else {
+          dt = static_cast<DataType>(
+              ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
+                  *op_def, "T", static_cast<int>(DataType::DT_FLOAT)));
+        }
         output_mem_map[op_def->output(out_idx)] = out_mem_type;
         output_map.emplace(
             op_def->output(out_idx),
@@ -272,13 +289,13 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
       auto &internal_output_info = output_map.at(output_info.name());
       if ((internal_output_info.mem_type != target_mem_type &&
           internal_output_info.mem_type != MemoryType::CPU_BUFFER) ||
-          internal_output_info.dtype != DataType::DT_FLOAT) {
+          internal_output_info.dtype != output_info.data_type()) {
         VLOG(1) << "Add Transform operation to transform output tensor '"
                 << output_info.name() << "', from memory type "
                 << internal_output_info.mem_type
                 << " to " << target_mem_type
                 << ", from Data Type " << internal_output_info.dtype
-                << " to " << DataType::DT_FLOAT;
+                << " to " << output_info.data_type();
         std::string t_output_name = TransformedName(output_info.name(),
             target_mem_type);
         auto output_op_def =
@@ -298,7 +315,7 @@ SerialNet::SerialNet(const OpRegistryBase *op_registry,
             t_output_name,
             internal_output_info.shape,
             output_info.name(),
-            DataType::DT_FLOAT,
+            output_info.data_type(),
             target_mem_type);
         auto transform_op = CreateOperation(
             op_registry,
diff --git a/mace/core/operator.cc b/mace/core/operator.cc
index 6a437f884c506af231db882a500560bdd8dc67ec..ad88c35b2d0bc0b5a216148084783cc5941cf9d1 100644
--- a/mace/core/operator.cc
+++ b/mace/core/operator.cc
@@ -24,6 +24,57 @@ namespace mace {
 OpConstructContext::OpConstructContext(Workspace *ws)
     : operator_def_(nullptr), ws_(ws), device_(nullptr) {}
 
+void OpConstructContext::set_operator_def(
+    std::shared_ptr<mace::OperatorDef> operator_def) {
+  input_data_types_.clear();  // overrides recorded for the old def are stale
+  operator_def_ = operator_def;
+}
+
+void OpConstructContext::set_output_mem_type(mace::MemoryType type) {
+  MACE_CHECK(operator_def_ != nullptr);
+  input_mem_types_.clear();  // inputs default to the output type again
+  output_mem_type_ = type;
+}
+
+void OpConstructContext::SetInputInfo(size_t idx, mace::MemoryType mem_type,
+                                      mace::DataType dt) {
+  MACE_CHECK(operator_def_ != nullptr);  // defaults below are read from it
+  if (input_mem_types_.empty()) {
+    // No overrides yet: start every input at the output memory type.
+    input_mem_types_.resize(operator_def_->input_size(), output_mem_type_);
+  }
+  if (input_data_types_.empty()) {
+    // No overrides yet: start every input at the op's "T" (DT_FLOAT if unset).
+    DataType op_dt = static_cast<DataType>(
+        ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
+            *operator_def_, "T", static_cast<int>(DataType::DT_FLOAT)));
+    input_data_types_.resize(operator_def_->input_size(), op_dt);
+  }
+  MACE_CHECK(idx < input_mem_types_.size() && idx < input_data_types_.size());
+  input_mem_types_[idx] = mem_type;
+  input_data_types_[idx] = dt;
+}
+
+MemoryType OpConstructContext::GetInputMemType(size_t idx) const {
+  // With no per-input overrides recorded, every input defaults to the
+  // output memory type; otherwise idx must name a recorded entry.
+  if (input_mem_types_.empty()) return output_mem_type_;
+  MACE_CHECK(idx < input_mem_types_.size(),
+             idx, " < ", input_mem_types_.size());
+  return input_mem_types_[idx];
+}
+
+DataType OpConstructContext::GetInputDataType(size_t idx) const {
+  if (!input_data_types_.empty()) {  // per-input overrides were recorded
+    MACE_CHECK(idx < input_data_types_.size());
+    return input_data_types_[idx];
+  }
+  // Default: the op's "T" argument (DT_FLOAT if absent); needs an op def.
+  MACE_CHECK(operator_def_ != nullptr);
+  return static_cast<DataType>(ProtoArgHelper::GetOptionalArg<OperatorDef, int>(
+      *operator_def_, "T", static_cast<int>(DataType::DT_FLOAT)));
+}
+
 OpInitContext::OpInitContext(Workspace *ws, Device *device)
     : ws_(ws), device_(device) {}
 
diff --git a/mace/core/operator.h b/mace/core/operator.h
index 8d3e1557bd5673ea07ddc4b3008711e43a8e27c2..5a119d1ee0cde520ac1820117080c7d0a19bc52b 100644
--- a/mace/core/operator.h
+++ b/mace/core/operator.h
@@ -35,9 +35,7 @@ class OpConstructContext {
   explicit OpConstructContext(Workspace *ws);
   ~OpConstructContext() = default;
 
-  inline void set_operator_def(std::shared_ptr<OperatorDef> operator_def) {
-    operator_def_ = operator_def;
-  }
+  void set_operator_def(std::shared_ptr<OperatorDef> operator_def);
 
   inline std::shared_ptr<OperatorDef> operator_def() const {
     return operator_def_;
@@ -55,19 +53,26 @@ class OpConstructContext {
     return device_;
   }
 
-  inline void set_output_mem_type(MemoryType type) {
-    output_mem_type_ = type;
-  }
+  void set_output_mem_type(MemoryType type);
 
   inline MemoryType output_mem_type() const {
     return output_mem_type_;
   }
 
+  void SetInputInfo(size_t idx, MemoryType mem_type, DataType dt);
+
+  MemoryType GetInputMemType(size_t idx) const;
+
+  DataType GetInputDataType(size_t idx) const;
+
  private:
   std::shared_ptr<OperatorDef> operator_def_;
   Workspace *ws_;
   Device *device_;
-  MemoryType output_mem_type_;  // used for transform memory
+  // used for memory transform
+  std::vector<MemoryType> input_mem_types_;
+  std::vector<DataType> input_data_types_;
+  MemoryType output_mem_type_;  // there is only one output memory type now.
 };
 
 // memory_optimizer, device
@@ -93,6 +98,12 @@ class OpInitContext {
   Device *device_;
 };
 
+// Conventions
+// * Where a data format applies, NHWC is the default format.
+// * The input/output format of CPU ops with float data type is NCHW.
+// * The input/output format of GPU ops and CPU Quantization ops is NHWC.
+// * Inputs' data type is the same as the operation's data type by default.
+// * Outputs' data type is the same as the operation's data type by default.
 class Operation {
  public:
   explicit Operation(OpConstructContext *context);
diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc
index eb21ef2c3e596ba28ce4178574dcb74db59a434f..d94b208d7bff66b36e3b179d7f33c471a17e8c8b 100644
--- a/mace/ops/conv_2d_test.cc
+++ b/mace/ops/conv_2d_test.cc
@@ -612,11 +612,9 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
                                const std::vector<int> &dilations,
                                const int wino_blk_size = 0) {
   testing::internal::LogToStderr();
-  srand(time(NULL));
 
   auto func = [&](index_t batch, int stride_h, int stride_w, Padding padding) {
     // generate random input
-    static unsigned int seed = time(NULL);
     index_t height = input_shape[0];
     index_t width = input_shape[1];
     index_t kernel_h = filter_shape[0];
diff --git a/mace/ops/deconv_2d.cc b/mace/ops/deconv_2d.cc
index 0b11667e39843378d7b58e86abefb15fa76fae89..5697c8413544742ad1517154c84511f9031cbabb 100644
--- a/mace/ops/deconv_2d.cc
+++ b/mace/ops/deconv_2d.cc
@@ -375,10 +375,16 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
             context, operator_def_.get(), 2,
             OpenCLBufferType::ARGUMENT, mem_type) == MaceStatus::MACE_SUCCESS);
       }
-    } else if (operator_def_->input_size() >= 4) {
-      MACE_CHECK(TransformFilter<T>(
-          context, operator_def_.get(), 3, OpenCLBufferType::ARGUMENT, mem_type)
-                     == MaceStatus::MACE_SUCCESS);
+    } else {
+      if (operator_def_->input_size() >= 4) {
+        MACE_CHECK(TransformFilter<T>(
+            context,
+            operator_def_.get(),
+            3,
+            OpenCLBufferType::ARGUMENT,
+            mem_type) == MaceStatus::MACE_SUCCESS);
+      }
+      context->SetInputInfo(2, MemoryType::CPU_BUFFER, DataType::DT_INT32);
     }
   }
   MaceStatus Run(OpContext *context) override {
diff --git a/mace/ops/ops_test_util.cc b/mace/ops/ops_test_util.cc
index 21407c6a743491820d431e077d01e30aa629ac9b..6b08761e34eec22992db490c21740865bdfe3660 100644
--- a/mace/ops/ops_test_util.cc
+++ b/mace/ops/ops_test_util.cc
@@ -166,10 +166,15 @@ bool OpsTestNet::Setup(mace::DeviceType device) {
       }
     }
 
-    for (auto output : op_def_.output()) {
-      ws_.RemoveTensor(output);
+    for (int i = 0; i < op_def_.output_size(); ++i) {
+      ws_.RemoveTensor(op_def_.output(i));
       auto output_info = net_def.add_output_info();
-      output_info->set_name(output);
+      output_info->set_name(op_def_.output(i));
+      if (op_def_.output_type_size() == op_def_.output_size()) {
+        output_info->set_data_type(op_def_.output_type(i));
+      } else {
+        output_info->set_data_type(DataType::DT_FLOAT);
+      }
     }
   }
   MemoryOptimizer mem_optimizer;
diff --git a/mace/ops/scalar_math.cc b/mace/ops/scalar_math.cc
index 5539e53f83be152a839e9bfa98178c2fedb933c6..297dcb33700d5c258676f28fa954f9692831ba0a 100644
--- a/mace/ops/scalar_math.cc
+++ b/mace/ops/scalar_math.cc
@@ -21,6 +21,7 @@
 namespace mace {
 namespace ops {
 
+namespace {
 template <typename T, typename DstType>
 void ScalarEltwise(const T* in0,
                    const T* in1,
@@ -81,6 +82,7 @@ void ScalarEltwise(const T* in0,
       LOG(FATAL) << "Eltwise op not support type " << type;
   }
 }
+}  // namespace
 
 
 template <DeviceType D, typename T>
@@ -156,12 +158,6 @@ void RegisterScalarMath(OpRegistryBase *op_registry) {
                    DeviceType::CPU, float);
   MACE_REGISTER_OP(op_registry, "ScalarMath", ScalarMathOp,
                    DeviceType::CPU, int32_t);
-#ifdef MACE_ENABLE_OPENCL
-  MACE_REGISTER_OP(op_registry, "ScalarMath", ScalarMathOp,
-                   DeviceType::GPU, float);
-  MACE_REGISTER_OP(op_registry, "ScalarMath", ScalarMathOp,
-                   DeviceType::GPU, int32_t);
-#endif  // MACE_ENABLE_OPENCL
 }
 
 }  // namespace ops
diff --git a/mace/ops/scalar_math_test.cc b/mace/ops/scalar_math_test.cc
index b9d8fd0b59de9df82cd9cfa17683d75ee643de08..743dd2565d6d6226b873e73fcfb7f2299a9dbfc2 100644
--- a/mace/ops/scalar_math_test.cc
+++ b/mace/ops/scalar_math_test.cc
@@ -79,30 +79,6 @@ TEST_F(ScalarMathOpTest, SimpleCPU) {
       ops::EltwiseType::EQUAL, 3, 3, 1, 1);
 }
 
-TEST_F(ScalarMathOpTest, SimpleGPU) {
-  ScalarMathTest<DeviceType::GPU, float, float>(
-      ops::EltwiseType::SUM, 1, 2, 1, 3);
-  ScalarMathTest<DeviceType::GPU, float, float>(
-      ops::EltwiseType::SUB, 1, 2, 1, -1);
-  ScalarMathTest<DeviceType::GPU, float, float>(
-      ops::EltwiseType::PROD, 3, -2, 1, -6);
-  ScalarMathTest<DeviceType::GPU, float, float>(
-      ops::EltwiseType::DIV, 3, -2, 1, -1.5);
-  ScalarMathTest<DeviceType::GPU, float, float>(
-      ops::EltwiseType::MIN, 3, -2, 1, -2);
-  ScalarMathTest<DeviceType::GPU, float, float>(
-      ops::EltwiseType::MAX, 3, -2, 1, 3);
-  ScalarMathTest<DeviceType::GPU, float, float>(
-      ops::EltwiseType::NEG, 3, -2, 1, -3);
-  ScalarMathTest<DeviceType::GPU, float, float>(
-      ops::EltwiseType::ABS, 3, -2, 1, 3);
-  ScalarMathTest<DeviceType::GPU, float, float>(
-      ops::EltwiseType::SQR_DIFF, 3, -2, 1, 25);
-  ScalarMathTest<DeviceType::GPU, float, float>(
-      ops::EltwiseType::POW, 3, 1, 1, 3);
-  ScalarMathTest<DeviceType::GPU, float, int32_t>(
-      ops::EltwiseType::EQUAL, 3, 3, 1, 1);
-}
 }  // namespace test
 }  // namespace ops
 }  // namespace mace
diff --git a/mace/ops/shape.cc b/mace/ops/shape.cc
index 675ab7c82a7fa553d9ec69cd6f4a77b68f5ceb98..58031ae098583d6be2108d791dfedf44cbfd8968 100644
--- a/mace/ops/shape.cc
+++ b/mace/ops/shape.cc
@@ -21,11 +21,7 @@ template <DeviceType D, typename T>
 class ShapeOp : public Operation {
  public:
   explicit ShapeOp(OpConstructContext *context)
-      : Operation(context) {
-    if (D == DeviceType::GPU) {
-      context->set_output_mem_type(MemoryType::GPU_BUFFER);
-    }
-  }
+      : Operation(context) {}
 
   MaceStatus Run(OpContext *context) override {
     MACE_UNUSED(context);
@@ -66,12 +62,6 @@ class ShapeOp : public Operation {
 void RegisterShape(OpRegistryBase *op_registry) {
   MACE_REGISTER_OP(op_registry, "Shape", ShapeOp,
                    DeviceType::CPU, float);
-#ifdef MACE_ENABLE_OPENCL
-  MACE_REGISTER_OP(op_registry, "Shape", ShapeOp,
-                   DeviceType::GPU, float);
-  MACE_REGISTER_OP(op_registry, "Shape", ShapeOp,
-                   DeviceType::GPU, half);
-#endif  // MACE_ENABLE_OPENCL
 }
 
 }  // namespace ops
diff --git a/mace/ops/stack.cc b/mace/ops/stack.cc
index f6269b0f4a08d471a0e25efbe3374142e5a9e20c..97719f18dc41dfd73bd9861901a497a54594303b 100644
--- a/mace/ops/stack.cc
+++ b/mace/ops/stack.cc
@@ -25,11 +25,7 @@ class StackOp : public Operation {
  public:
   explicit StackOp(OpConstructContext *context)
       : Operation(context),
-        axis_(Operation::GetOptionalArg<int>("axis", 0)) {
-    if (D == DeviceType::GPU) {
-      context->set_output_mem_type(MemoryType::GPU_BUFFER);
-    }
-  }
+        axis_(Operation::GetOptionalArg<int>("axis", 0)) {}
 
   MaceStatus Run(OpContext *context) override {
     MACE_UNUSED(context);
@@ -54,6 +50,7 @@ class StackOp : public Operation {
     }
 
     // Output is on host, no need to map data
+    Tensor::MappingGuard output_guard(output);
     auto *output_data = output->mutable_data<T>();
     std::vector<const T *> input_data(inputs.size());
     for (size_t i = 0; i < inputs.size(); ++i) {
@@ -83,10 +80,6 @@ class StackOp : public Operation {
 void RegisterStack(OpRegistryBase *op_registry) {
   MACE_REGISTER_OP(op_registry, "Stack", StackOp, DeviceType::CPU, float);
   MACE_REGISTER_OP(op_registry, "Stack", StackOp, DeviceType::CPU, int32_t);
-#ifdef MACE_ENABLE_OPENCL
-  MACE_REGISTER_OP(op_registry, "Stack", StackOp, DeviceType::GPU, float);
-  MACE_REGISTER_OP(op_registry, "Stack", StackOp, DeviceType::GPU, int32_t);
-#endif  // MACE_ENABLE_OPENCL
 }
 
 }  // namespace ops
diff --git a/mace/ops/strided_slice.cc b/mace/ops/strided_slice.cc
index 7c60bfe89faf5c091caa2f77420753315682e8c7..b3b53ec859e704328793394437e44160d36c7c76 100644
--- a/mace/ops/strided_slice.cc
+++ b/mace/ops/strided_slice.cc
@@ -217,12 +217,6 @@ void RegisterStridedSlice(OpRegistryBase *op_registry) {
                    DeviceType::CPU, float);
   MACE_REGISTER_OP(op_registry, "StridedSlice", StridedSliceOp,
                    DeviceType::CPU, int32_t);
-#ifdef MACE_ENABLE_OPENCL
-  MACE_REGISTER_OP(op_registry, "StridedSlice", StridedSliceOp,
-                   DeviceType::GPU, float);
-  MACE_REGISTER_OP(op_registry, "StridedSlice", StridedSliceOp,
-                   DeviceType::GPU, int32_t);
-#endif  // MACE_ENABLE_OPENCL
 }
 
 }  // namespace ops
diff --git a/tools/converter.py b/tools/converter.py
index e98715fc95def1972c376c76c211758b19c6b2b2..0e9202f1066f87805f52050ca50ff0f9a7042cf0 100644
--- a/tools/converter.py
+++ b/tools/converter.py
@@ -741,7 +741,7 @@ def download_file(url, dst, num_retries=3):
         MaceLogger.info('\nDownloaded successfully.')
     except (urllib.error.ContentTooShortError, urllib.error.HTTPError,
             urllib.error.URLError) as e:
-        MaceLogger.warning('Download error:', e)
+        MaceLogger.warning('Download error:' + str(e))
         if num_retries > 0:
             return download_file(url, dst, num_retries - 1)
         else: