support stack/stridedslice/scalar eltwise on gpu runtime

8828ddb0 · liutuo · 577baf1b · 8828ddb0 · 8828ddb0 · 8828ddb0
22 changed files
--- a/mace/core/workspace.cc
+++ b/mace/core/workspace.cc
@@ -124,6 +124,7 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,

        tensor_map_[const_tensor.name()] = std::move(tensor);
      }
+      fused_buffer_ = false;
    } else {
 #else
    {
@@ -165,6 +166,7 @@ MaceStatus Workspace::LoadModelTensor(const NetDef &net_def,
        tensor->SetZeroPoint(const_tensor.zero_point());
        tensor_map_[const_tensor.name()] = std::move(tensor);
      }
+      fused_buffer_ = true;
    }
  }

@@ -327,7 +329,34 @@ void Workspace::RemoveUnusedBuffer() {
      tensor_map_.erase(old_iter);
    }
  }
+  tensor_buffer_.reset(nullptr);
+}
+
+// Walks every constant tensor declared in `net_def` and
+//   * erases it from tensor_map_ when it is marked unused, or
+//   * when fused_buffer_ is set, replaces it with a freshly allocated tensor
+//     (GPU device allocator) whose bytes are copied from `model_data`.
+// Afterwards tensor_buffer_ is released.
+//
+// net_def:    model definition whose tensors() are visited.
+// model_data: base pointer of the serialized weights; each tensor's bytes are
+//             read starting at model_data + const_tensor.offset().
+void Workspace::RemoveAndReloadBuffer(const NetDef &net_def,
+                                      const unsigned char *model_data) {
+  for (auto &const_tensor : net_def.tensors()) {
+    auto iter = tensor_map_.find(const_tensor.name());
+    if (iter == tensor_map_.end()) {
+      // Tensor is not registered in this workspace: there is nothing to
+      // remove or reload, and dereferencing end() would be undefined.
+      continue;
+    }
+    if (iter->second->unused()) {
+      tensor_map_.erase(iter);
+    } else if (fused_buffer_) {
+      // Drop the existing entry before building its replacement.
+      tensor_map_.erase(iter);
+      const std::vector<index_t> dims(const_tensor.dims().begin(),
+                                      const_tensor.dims().end());
+      std::unique_ptr<Tensor> tensor(
+          new Tensor(GetDeviceAllocator(DeviceType::GPU),
+                     const_tensor.data_type()));
+      tensor->Resize(dims);
+      MACE_CHECK(tensor->size() == const_tensor.data_size(),
+                 "Tensor's data_size not equal with the shape");
+      tensor->CopyBytes(model_data + const_tensor.offset(),
+                        const_tensor.data_size() *
+                            GetEnumTypeSize(const_tensor.data_type()));
+      tensor_map_[const_tensor.name()] = std::move(tensor);
+    }
+  }
  tensor_buffer_.reset(nullptr);
 }


--- a/mace/core/workspace.h
+++ b/mace/core/workspace.h
@@ -55,6 +55,9 @@ class Workspace {

  void RemoveUnusedBuffer();

+  void RemoveAndReloadBuffer(const NetDef &net_def,
+                             const unsigned char *model_data);
+
 private:
  MaceStatus CreateOutputTensorBuffer(const NetDef &net_def,
                                      DeviceType device_type);
@@ -66,6 +69,7 @@ class Workspace {
  PreallocatedPooledAllocator preallocated_allocator_;

  std::unique_ptr<ScratchBuffer> host_scratch_buffer_;
+  bool fused_buffer_;

  MACE_DISABLE_COPY_AND_ASSIGN(Workspace);
 };

--- a/mace/kernels/deconv_2d.h
+++ b/mace/kernels/deconv_2d.h
@@ -174,15 +174,15 @@ struct Deconv2dFunctorBase {
    switch (padding) {
      case VALID:
        expected_input_height =
-            (out_height - filter_h) / strides[0] + 1;
+            (out_height - filter_h + strides[0]) / strides[0];
        expected_input_width =
-            (out_width - filter_w) / strides[1] + 1;
+            (out_width - filter_w + strides[1]) / strides[1];
        break;
      case SAME:
        expected_input_height =
-            (out_height - 1) / strides[0] + 1;
+            (out_height + strides[0] - 1) / strides[0];
        expected_input_width =
-            (out_width - 1) / strides[1] + 1;
+            (out_width + strides[1] - 1) / strides[1];
        break;
      default:
        MACE_CHECK(false, "Unsupported padding type: ", padding);

--- a/mace/kernels/eltwise.h
+++ b/mace/kernels/eltwise.h
@@ -805,13 +805,19 @@ inline void TensorEltwisePerChannel(const EltwiseType type,
 struct EltwiseFunctorBase {
  EltwiseFunctorBase(const EltwiseType type,
                     const std::vector<float> &coeff,
-                     const float value,
+                     const float scalar_input,
+                     const int32_t scalar_input_index,
                     const DataFormat data_format)
-      : type_(type), coeff_(coeff), value_(value), data_format_(data_format) {}
+      : type_(type),
+        coeff_(coeff),
+        scalar_input_(scalar_input),
+        scalar_input_index_(scalar_input_index),
+        data_format_(data_format) {}

  EltwiseType type_;
  std::vector<float> coeff_;
-  float value_;
+  float scalar_input_;
+  int32_t scalar_input_index_;
  DataFormat data_format_;
 };

@@ -819,9 +825,14 @@ template <DeviceType D, typename T>
 struct EltwiseFunctor : EltwiseFunctorBase {
  EltwiseFunctor(const EltwiseType type,
                 const std::vector<float> &coeff,
-                 const float value,  // keep it float as it comes from arg
+                 const float scalar_input,  // float as it comes from arg
+                 const int32_t scalar_input_index,
                 const DataFormat data_format)
-      : EltwiseFunctorBase(type, coeff, value, data_format) {}
+      : EltwiseFunctorBase(type,
+                           coeff,
+                           scalar_input,
+                           scalar_input_index,
+                           data_format) {}

  template <typename DstType>
  MaceStatus DoEltwise(const Tensor *input0,
@@ -832,6 +843,9 @@ struct EltwiseFunctor : EltwiseFunctorBase {
      std::swap(input0, input1);
      swapped = true;
    }
+    if (scalar_input_index_ == 0) {
+      swapped = !swapped;
+    }

    // check if we can broadcast tensor
    uint32_t rank_diff =
@@ -924,7 +938,7 @@ struct EltwiseFunctor : EltwiseFunctorBase {
      scalar_tensor_.Resize({});
      Tensor::MappingGuard guard(&scalar_tensor_);
      auto scalar_data = scalar_tensor_.mutable_data<T>();
-      scalar_data[0] = static_cast<T>(value_);
+      scalar_data[0] = static_cast<T>(scalar_input_);
      input1 = &scalar_tensor_;
    }

@@ -944,9 +958,14 @@ template <typename T>
 struct EltwiseFunctor<DeviceType::GPU, T> : EltwiseFunctorBase {
  EltwiseFunctor(const EltwiseType type,
                 const std::vector<float> &coeff,
-                 const float value,
+                 const float scalar_input,
+                 const int32_t scalar_input_index,
                 const DataFormat data_format)
-      : EltwiseFunctorBase(type, coeff, value, data_format) {}
+      : EltwiseFunctorBase(type,
+                           coeff,
+                           scalar_input,
+                           scalar_input_index,
+                           data_format) {}

  MaceStatus operator()(const Tensor *input0,
                        const Tensor *input1,

--- a/mace/kernels/opencl/deconv_2d.cc
+++ b/mace/kernels/opencl/deconv_2d.cc
@@ -152,7 +152,6 @@ MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(
  MACE_CHECK_NOTNULL(input);
  MACE_CHECK_NOTNULL(filter);
  MACE_CHECK_NOTNULL(output);
-
  if (!from_caffe_) {
    if (output_shape_.size() != 4) {
      MACE_CHECK_NOTNULL(output_shape_tensor);
@@ -174,7 +173,6 @@ MaceStatus Deconv2dFunctor<DeviceType::GPU, T>::operator()(
    CalcDeconvOutputSize(input->shape().data(), filter->shape().data(),
                         strides_, output_shape_.data(), paddings_.data());
  }
-
  std::vector<size_t> output_image_shape;
  CalImage2DShape(output_shape_, BufferType::IN_OUT_CHANNEL,
                  &output_image_shape);

--- a/mace/kernels/opencl/eltwise.cc
+++ b/mace/kernels/opencl/eltwise.cc
@@ -48,6 +48,10 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
    }
  }

+  if (scalar_input_index_ == 0) {
+    swapped = !swapped;
+  }
+
  std::vector<index_t> output_shape(4);
  output_shape[0] = input0->dim(0);
  output_shape[1] = input0->dim(1);
@@ -104,7 +108,7 @@ MaceStatus EltwiseFunctor<DeviceType::GPU, T>::operator()(const Tensor *input0,
    SET_3D_GWS_ARGS(kernel_);
    kernel_.setArg(idx++, *(input0->opencl_image()));
    if (input1 == nullptr) {
-      kernel_.setArg(idx++, value_);
+      kernel_.setArg(idx++, scalar_input_);
    } else {
      kernel_.setArg(idx++, *(input1->opencl_image()));
    }

--- a/mace/kernels/scalar_math.h
+++ b/mace/kernels/scalar_math.h
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_KERNELS_SCALAR_MATH_H_
+#define MACE_KERNELS_SCALAR_MATH_H_
+
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+#include "mace/core/future.h"
+#include "mace/core/tensor.h"
+#include "mace/public/mace.h"
+#include "mace/kernels/eltwise.h"
+
+namespace mace {
+namespace kernels {
+
+// Applies the element-wise operation `type` to the single elements in0[0]
+// and in1[0] and stores the result in out[0].
+//
+// in0, in1: operands; only index 0 is read (in1 is not read for NEG/ABS).
+// type:     operation selector; any value without a case below hits
+//           LOG(FATAL).
+// coeff:    optional weights for SUM; must be empty or hold exactly two
+//           values (enforced with MACE_CHECK).
+// swapped:  when true, the operand roles are exchanged for weighted SUM, SUB
+//           and DIV. The other cases ignore it; in particular POW always
+//           computes in0[0] raised to in1[0].
+// out:      destination; only index 0 is written.
+template <typename T, typename DstType>
+void ScalarEltwise(const T* in0,
+                   const T* in1,
+                   const EltwiseType type,
+                   const std::vector<float> &coeff,
+                   const bool swapped,
+                   DstType* out) {
+  switch (type) {
+    case SUM:
+      if (coeff.empty()) {
+        out[0] = in0[0] + in1[0];
+      } else {
+        MACE_CHECK(coeff.size() == 2,
+                   "sum's coeff params' size should be 2.");
+        if (swapped)
+          out[0] = in0[0] * coeff[1] + in1[0] * coeff[0];
+        else
+          out[0] = in0[0] * coeff[0] + in1[0] * coeff[1];
+      }
+      break;
+    case SUB:
+      if (swapped)
+        out[0] = in1[0] - in0[0];
+      else
+        out[0] = in0[0] - in1[0];
+      break;
+    case PROD:
+      out[0] = in0[0] * in1[0];
+      break;
+    case DIV:
+      if (swapped)
+        out[0] = in1[0] / in0[0];
+      else
+        out[0] = in0[0] / in1[0];
+      break;
+    case MIN:
+      out[0] = std::min(in1[0], in0[0]);
+      break;
+    case MAX:
+      out[0] = std::max(in1[0], in0[0]);
+      break;
+    case SQR_DIFF: {
+      // Square by plain multiplication: stays in T and avoids the general
+      // std::pow routine for a fixed exponent of 2.
+      const T diff = in1[0] - in0[0];
+      out[0] = diff * diff;
+      break;
+    }
+    case POW:
+      out[0] = std::pow(in0[0], in1[0]);
+      break;
+    case EQUAL:
+      out[0] = in1[0] == in0[0];
+      break;
+    case NEG:
+      out[0] = -in0[0];
+      break;
+    case ABS:
+      out[0] = in0[0] > 0 ? in0[0] : -in0[0];
+      break;
+    default:
+      LOG(FATAL) << "Eltwise op not support type " << type;
+  }
+}
+
+
+// Functor evaluating an element-wise operation on single-element tensors
+// using host-side arithmetic (ScalarEltwise).
+//
+// Constructor arguments:
+//   type:               operation to apply.
+//   coeff:              optional SUM weights, forwarded to ScalarEltwise.
+//   scalar_input:       value used as the second operand when only one input
+//                       tensor is supplied.
+//   scalar_input_index: when 0, ScalarEltwise is called with swapped == true.
+template <DeviceType D, typename T>
+struct ScalarMathFunctor {
+  explicit ScalarMathFunctor(const EltwiseType type,
+                             const std::vector<float> &coeff,
+                             const float scalar_input,
+                             const int32_t scalar_input_index)
+      : type_(type),
+        coeff_(coeff),
+        scalar_input_(scalar_input),
+        scalar_input_index_(scalar_input_index) {}
+
+  // inputs: one or two tensors. inputs[0] must hold exactly one element with
+  //         rank <= 1; inputs[1], when present, must have rank 0.
+  // output: resized to inputs[0]'s shape; holds int32_t for logical
+  //         operation types and T otherwise.
+  // future: receives the default wait function.
+  // Returns MACE_SUCCESS, or the error status of a failed output resize.
+  MaceStatus operator()(const std::vector<const Tensor *> &inputs,
+                        Tensor *output,
+                        StatsFuture *future) {
+    // Guard the unconditional inputs[0] access below.
+    MACE_CHECK(!inputs.empty(), "ScalarMath requires at least one input");
+    const Tensor* input0 = inputs[0];
+    const Tensor* input1 = (inputs.size() >= 2) ? inputs[1] : nullptr;
+    MACE_CHECK(input0->dim_size() <= 1 && input0->size() == 1,
+               "not support input dim size") << input0->dim_size();
+    Tensor::MappingGuard in0_guard(input0);
+    const T* in0 = input0->data<T>();
+    // Default second operand: the scalar argument converted to T.
+    auto v = static_cast<T>(scalar_input_);
+    const T* in1 = &v;
+    Tensor::MappingGuard in1_guard(input1);
+    if (input1) {
+      MACE_CHECK(input1->dim_size() == 0);
+      in1 = input1->data<T>();
+    }
+    // Propagate resize failures on both paths instead of dropping the
+    // status of the rank-0 case.
+    if (input0->dim_size() > 0) {
+      MACE_RETURN_IF_ERROR(output->Resize(input0->shape()));
+    } else {
+      MACE_RETURN_IF_ERROR(output->Resize({}));
+    }
+
+    Tensor::MappingGuard output_guard(output);
+    bool swapped = scalar_input_index_ == 0;
+
+    if (IsLogicalType(type_)) {
+      int32_t* out = output->mutable_data<int32_t>();
+      ScalarEltwise<T, int32_t>(in0,
+                                in1,
+                                type_,
+                                coeff_,
+                                swapped,
+                                out);
+    } else {
+      T* out = output->mutable_data<T>();
+      ScalarEltwise<T, T>(in0,
+                          in1,
+                          type_,
+                          coeff_,
+                          swapped,
+                          out);
+    }
+
+    SetFutureDefaultWaitFn(future);
+    return MACE_SUCCESS;
+  }
+
+  EltwiseType type_;
+  std::vector<float> coeff_;
+  float scalar_input_;
+  int32_t scalar_input_index_;
+};
+
+}  // namespace kernels
+}  // namespace mace
+
+#endif  // MACE_KERNELS_SCALAR_MATH_H_
--- a/mace/kernels/stack.h
+++ b/mace/kernels/stack.h
@@ -46,7 +46,13 @@ struct StackFunctor {
    output_shape.insert(output_shape.begin() + axis_, inputs.size());
    MACE_RETURN_IF_ERROR(output->Resize(output_shape));

-    // On host, no need to map data
+    // Some inputs may be in gpu memory, so add mapping here.
+    std::vector<Tensor::MappingGuard> mappers;
+    for (size_t i = 0; i < inputs.size(); ++i) {
+      mappers.emplace_back(Tensor::MappingGuard(inputs[i]));
+    }
+
+    // Output is on host, no need to map data
    T *output_data = output->mutable_data<T>();
    std::vector<const T *> input_data(inputs.size());
    for (size_t i = 0; i < inputs.size(); ++i) {

--- a/mace/kernels/strided_slice.h
+++ b/mace/kernels/strided_slice.h
@@ -51,7 +51,6 @@ struct StridedSliceFunctor {
                        StatsFuture *future) {
    MACE_CHECK(ellipsis_mask_ == 0 && new_axis_mask_ == 0,
               "ellipsis_mask and new_axis_mask are not supported yet.");
-
    if (strides == nullptr) {
      tmp_strides_tensor_.Resize({begin_indices->size()});
      Tensor::MappingGuard strides_guard(&tmp_strides_tensor_);
@@ -68,7 +67,6 @@ struct StridedSliceFunctor {
    const int32_t *begin_indices_data = begin_indices->data<int32_t>();
    const int32_t *end_indices_data = end_indices->data<int32_t>();
    const int32_t *strides_data = strides->data<int32_t>();
-
    std::vector<int32_t> pad_begin_indices(input->dim_size(), 0);
    std::vector<int32_t> pad_end_indices(input->dim_size(), 0);
    std::vector<int32_t> pad_strides_indices(input->dim_size(), 1);

--- a/mace/libmace/mace.cc
+++ b/mace/libmace/mace.cc
@@ -267,7 +267,7 @@ MaceStatus MaceEngine::Impl::Init(
  }
 #endif
  if (device_type_ == DeviceType::GPU) {
-    ws_->RemoveUnusedBuffer();
+    ws_->RemoveAndReloadBuffer(*net_def, model_data);
  }
  return MaceStatus::MACE_SUCCESS;
 }

--- a/mace/ops/eltwise.h
+++ b/mace/ops/eltwise.h
@@ -30,7 +30,8 @@ class EltwiseOp : public Operator<D, T> {
            static_cast<kernels::EltwiseType>(OperatorBase::GetOptionalArg<int>(
                "type", static_cast<int>(kernels::EltwiseType::NONE))),
            OperatorBase::GetRepeatedArgs<float>("coeff"),
-            OperatorBase::GetOptionalArg<float>("value", 1.0),
+            OperatorBase::GetOptionalArg<float>("scalar_input", 1.0),
+            OperatorBase::GetOptionalArg<int32_t>("scalar_input_index", 1),
            static_cast<DataFormat>(OperatorBase::GetOptionalArg<int>(
                "data_format", 0))) {}


--- a/mace/ops/eltwise_test.cc
+++ b/mace/ops/eltwise_test.cc
@@ -39,7 +39,7 @@ void SimpleScalarScalar(const kernels::EltwiseType type,
        .Input("Input")
        .AddIntArg("T", DataTypeToEnum<T>::v())
        .AddIntArg("type", static_cast<int>(type))
-        .AddFloatArg("value", x)
+        .AddFloatArg("scalar_input", x)
        .OutputType({kernels::IsLogicalType(type) ? DT_INT32 : DT_FLOAT})
        .Output("Output")
        .Finalize(net.NewOperatorDef());
@@ -72,7 +72,7 @@ void SimpleTensorScalar(const kernels::EltwiseType type,
        .Input("TInput")
        .AddIntArg("T", DataTypeToEnum<T>::v())
        .AddIntArg("type", static_cast<int>(type))
-        .AddFloatArg("value", x)
+        .AddFloatArg("scalar_input", x)
        .AddIntArg("data_format", DataFormat::NCHW)
        .OutputType({kernels::IsLogicalType(type) ? DT_INT32 : DT_FLOAT})
        .Output("TOutput")
@@ -86,7 +86,7 @@ void SimpleTensorScalar(const kernels::EltwiseType type,
    OpDefBuilder("Eltwise", "EltwiseTest")
        .Input("InputImg")
        .AddIntArg("type", static_cast<int>(type))
-        .AddFloatArg("value", x)
+        .AddFloatArg("scalar_input", x)
        .Output("OutputImg")
        .Finalize(net.NewOperatorDef());

@@ -468,7 +468,7 @@ void RandomTensorScalar(const kernels::EltwiseType type,
  OpDefBuilder("Eltwise", "EltwiseTest")
      .Input("TInput")
      .AddIntArg("type", static_cast<int>(type))
-      .AddFloatArg("value", 0.1)
+      .AddFloatArg("scalar_input", 0.1)
      .AddIntArg("data_format", DataFormat::NCHW)
      .Output("TOutput")
      .Finalize(net.NewOperatorDef());
@@ -484,7 +484,7 @@ void RandomTensorScalar(const kernels::EltwiseType type,
  OpDefBuilder("Eltwise", "EltwiseTest")
      .Input("InputImg")
      .AddIntArg("type", static_cast<int>(type))
-      .AddFloatArg("value", 0.1)
+      .AddFloatArg("scalar_input", 0.1)
      .Output("OutputImg")
      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
      .Finalize(net.NewOperatorDef());

--- a/mace/ops/ops_register.cc
+++ b/mace/ops/ops_register.cc
@@ -48,6 +48,7 @@ extern void Register_Quantize(OperatorRegistryBase *op_registry);
 extern void Register_ReduceMean(OperatorRegistryBase *op_registry);
 extern void Register_Reshape(OperatorRegistryBase *op_registry);
 extern void Register_ResizeBilinear(OperatorRegistryBase *op_registry);
+extern void Register_ScalarMath(OperatorRegistryBase *op_registry);
 extern void Register_Shape(OperatorRegistryBase *op_registry);
 extern void Register_Split(OperatorRegistryBase *op_registry);
 extern void Register_Softmax(OperatorRegistryBase *op_registry);
@@ -99,6 +100,7 @@ OperatorRegistry::OperatorRegistry() : OperatorRegistryBase() {
  ops::Register_ReduceMean(this);
  ops::Register_Reshape(this);
  ops::Register_ResizeBilinear(this);
+  ops::Register_ScalarMath(this);
  ops::Register_Shape(this);
  ops::Register_Split(this);
  ops::Register_Softmax(this);

--- a/mace/ops/scalar_math.cc
+++ b/mace/ops/scalar_math.cc
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/ops/scalar_math.h"
+
+namespace mace {
+namespace ops {
+
+// Registers the "ScalarMath" operator with `op_registry` for every supported
+// device/type combination: CPU and GPU, each with float and int32_t.
+void Register_ScalarMath(OperatorRegistryBase *op_registry) {
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ScalarMath")
+                                          .Device(DeviceType::CPU)
+                                          .TypeConstraint<float>("T")
+                                          .Build(),
+                         ScalarMathOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ScalarMath")
+                                          .Device(DeviceType::CPU)
+                                          .TypeConstraint<int32_t>("T")
+                                          .Build(),
+                         ScalarMathOp<DeviceType::CPU, int32_t>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ScalarMath")
+                                          .Device(DeviceType::GPU)
+                                          .TypeConstraint<float>("T")
+                                          .Build(),
+                         ScalarMathOp<DeviceType::GPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("ScalarMath")
+                                          .Device(DeviceType::GPU)
+                                          .TypeConstraint<int32_t>("T")
+                                          .Build(),
+                         ScalarMathOp<DeviceType::GPU, int32_t>);
+}
+
+}  // namespace ops
+}  // namespace mace
--- a/mace/ops/scalar_math.h
+++ b/mace/ops/scalar_math.h
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MACE_OPS_SCALAR_MATH_H_
+#define MACE_OPS_SCALAR_MATH_H_
+
+#include <vector>
+
+#include "mace/core/operator.h"
+#include "mace/kernels/scalar_math.h"
+
+namespace mace {
+namespace ops {
+
+// Operator wrapper around kernels::ScalarMathFunctor.
+//
+// Arguments read from the operator definition:
+//   "type"               element-wise operation (defaults to NONE),
+//   "coeff"              repeated float weights,
+//   "scalar_input"       float operand used when no second input tensor is
+//                        given (defaults to 1.0),
+//   "scalar_input_index" position of the scalar operand (defaults to 1).
+template <DeviceType D, typename T>
+class ScalarMathOp : public Operator<D, T> {
+ public:
+  ScalarMathOp(const OperatorDef &op_def, Workspace *ws)
+      : Operator<D, T>(op_def, ws),
+        functor_(static_cast<kernels::EltwiseType>(
+                   OperatorBase::GetOptionalArg<int>(
+                       "type", static_cast<int>(kernels::EltwiseType::NONE))),
+                 OperatorBase::GetRepeatedArgs<float>("coeff"),
+                 OperatorBase::GetOptionalArg<float>("scalar_input", 1.0),
+                 OperatorBase::GetOptionalArg<int32_t>(
+                     "scalar_input_index", 1)) {}
+
+  // Forwards all input tensors and output 0 to the functor and returns its
+  // status.
+  MaceStatus Run(StatsFuture *future) override {
+    // Bind by const reference so the input list is not copied on every run;
+    // this stays valid whether Inputs() returns a reference or a value.
+    const std::vector<const Tensor *> &input_list = this->Inputs();
+    Tensor *output = this->Output(0);
+    return functor_(input_list, output, future);
+  }
+
+ private:
+  kernels::ScalarMathFunctor<D, T> functor_;
+};
+
+}  // namespace ops
+}  // namespace mace
+
+#endif  // MACE_OPS_SCALAR_MATH_H_
--- a/mace/ops/scalar_math_test.cc
+++ b/mace/ops/scalar_math_test.cc
+// Copyright 2018 Xiaomi, Inc.  All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mace/core/operator.h"
+#include "mace/ops/ops_test_util.h"
+#include "mace/kernels/eltwise.h"
+
+namespace mace {
+namespace ops {
+namespace test {
+
+class ScalarMathOpTest : public OpsTestBase {};
+
+namespace {
+// Builds a two-input "ScalarMath" op of the given `type`, runs it on device
+// D and compares the rank-0 result with `output` (tolerance 1e-5).
+// `x` is passed through as the op's "scalar_input" argument.
+template <DeviceType D, typename T, typename DstType>
+void ScalarMathTest(const kernels::EltwiseType type,
+                    const T input0,
+                    const T input1,
+                    const float x,
+                    const DstType output) {
+  OpsTestNet net;
+
+  // Both operands are fed as rank-0 tensors.
+  net.AddInputFromArray<D, T>("Input0", {}, {input0});
+  net.AddInputFromArray<D, T>("Input1", {}, {input1});
+
+  // Logical operation types produce int32 output; all others produce float.
+  const auto output_type =
+      kernels::IsLogicalType(type) ? DT_INT32 : DT_FLOAT;
+
+  OpDefBuilder("ScalarMath", "ScalarMathTest")
+      .Input("Input0")
+      .Input("Input1")
+      .AddIntArg("T", DataTypeToEnum<T>::v())
+      .AddIntArg("type", static_cast<int>(type))
+      .AddFloatArg("scalar_input", x)
+      .OutputType({output_type})
+      .Output("Output")
+      .Finalize(net.NewOperatorDef());
+
+  net.RunOp(D);
+
+  // Verify against the expected scalar.
+  auto expected = CreateTensor<DstType>({}, {output});
+  ExpectTensorNear<DstType>(*expected, *net.GetOutput("Output"), 1e-5);
+}
+}  // namespace
+
+// Exercises each supported operation type once on the CPU device.
+// Argument order: type, input0, input1, scalar_input, expected output.
+TEST_F(ScalarMathOpTest, SimpleCPU) {
+  ScalarMathTest<DeviceType::CPU, float, float>(
+      kernels::EltwiseType::SUM, 1, 2, 3, 3);
+  ScalarMathTest<DeviceType::CPU, float, float>(
+      kernels::EltwiseType::SUB, 1, 2, 3, -1);
+  ScalarMathTest<DeviceType::CPU, float, float>(
+      kernels::EltwiseType::PROD, 3, -2, 3, -6);
+  ScalarMathTest<DeviceType::CPU, float, float>(
+      kernels::EltwiseType::DIV, 3, -2, 1, -1.5);
+  ScalarMathTest<DeviceType::CPU, float, float>(
+      kernels::EltwiseType::MIN, 3, -2, 1, -2);
+  ScalarMathTest<DeviceType::CPU, float, float>(
+      kernels::EltwiseType::MAX, 3, -2, 1, 3);
+  ScalarMathTest<DeviceType::CPU, float, float>(
+      kernels::EltwiseType::NEG, 3, -2, 1, -3);
+  ScalarMathTest<DeviceType::CPU, float, float>(
+      kernels::EltwiseType::ABS, 3, -2, 1, 3);
+  ScalarMathTest<DeviceType::CPU, float, float>(
+      kernels::EltwiseType::SQR_DIFF, 3, -2, 1, 25);
+  ScalarMathTest<DeviceType::CPU, float, float>(
+      kernels::EltwiseType::POW, 3, 1, 1, 3);
+  ScalarMathTest<DeviceType::CPU, float, int32_t>(
+      kernels::EltwiseType::EQUAL, 3, 3, 1, 1);
+}
+
+// Exercises each supported operation type once on the GPU device.
+// Argument order: type, input0, input1, scalar_input, expected output.
+TEST_F(ScalarMathOpTest, SimpleGPU) {
+  ScalarMathTest<DeviceType::GPU, float, float>(
+      kernels::EltwiseType::SUM, 1, 2, 1, 3);
+  ScalarMathTest<DeviceType::GPU, float, float>(
+      kernels::EltwiseType::SUB, 1, 2, 1, -1);
+  ScalarMathTest<DeviceType::GPU, float, float>(
+      kernels::EltwiseType::PROD, 3, -2, 1, -6);
+  ScalarMathTest<DeviceType::GPU, float, float>(
+      kernels::EltwiseType::DIV, 3, -2, 1, -1.5);
+  ScalarMathTest<DeviceType::GPU, float, float>(
+      kernels::EltwiseType::MIN, 3, -2, 1, -2);
+  ScalarMathTest<DeviceType::GPU, float, float>(
+      kernels::EltwiseType::MAX, 3, -2, 1, 3);
+  ScalarMathTest<DeviceType::GPU, float, float>(
+      kernels::EltwiseType::NEG, 3, -2, 1, -3);
+  ScalarMathTest<DeviceType::GPU, float, float>(
+      kernels::EltwiseType::ABS, 3, -2, 1, 3);
+  ScalarMathTest<DeviceType::GPU, float, float>(
+      kernels::EltwiseType::SQR_DIFF, 3, -2, 1, 25);
+  ScalarMathTest<DeviceType::GPU, float, float>(
+      kernels::EltwiseType::POW, 3, 1, 1, 3);
+  ScalarMathTest<DeviceType::GPU, float, int32_t>(
+      kernels::EltwiseType::EQUAL, 3, 3, 1, 1);
+}
+}  // namespace test
+}  // namespace ops
+}  // namespace mace
--- a/mace/ops/stack.cc
+++ b/mace/ops/stack.cc
@@ -28,6 +28,16 @@ void Register_Stack(OperatorRegistryBase *op_registry) {
                                          .TypeConstraint<int32_t>("T")
                                          .Build(),
                         StackOp<DeviceType::CPU, int32_t>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Stack")
+                                          .Device(DeviceType::GPU)
+                                          .TypeConstraint<float>("T")
+                                          .Build(),
+                         StackOp<DeviceType::GPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Stack")
+                                          .Device(DeviceType::GPU)
+                                          .TypeConstraint<int32_t>("T")
+                                          .Build(),
+                         StackOp<DeviceType::GPU, int32_t>);
 }

 }  // namespace ops

--- a/mace/ops/strided_slice.cc
+++ b/mace/ops/strided_slice.cc
@@ -28,6 +28,16 @@ void Register_StridedSlice(OperatorRegistryBase *op_registry) {
                                          .TypeConstraint<int32_t>("T")
                                          .Build(),
                         StridedSliceOp<DeviceType::CPU, int32_t>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("StridedSlice")
+                                          .Device(DeviceType::GPU)
+                                          .TypeConstraint<float>("T")
+                                          .Build(),
+                         StridedSliceOp<DeviceType::GPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("StridedSlice")
+                                          .Device(DeviceType::GPU)
+                                          .TypeConstraint<int32_t>("T")
+                                          .Build(),
+                         StridedSliceOp<DeviceType::GPU, int32_t>);
 }

 }  // namespace ops

--- a/mace/python/tools/converter_tool/base_converter.py
+++ b/mace/python/tools/converter_tool/base_converter.py
@@ -101,6 +101,7 @@ MaceSupportedOps = [
    'ReduceMean',
    'Reshape',
    'ResizeBilinear',
+    'ScalarMath',
    'Slice',
    'Split',
    'Shape',
@@ -153,7 +154,7 @@ class MaceKeyword(object):
    mace_shape_str = 'shape'
    mace_winograd_filter_transformed = 'is_filter_transformed'
    mace_device = 'device'
-    mace_value_str = 'value'
+    mace_scalar_input_str = 'scalar_input'
    mace_wino_block_size = 'wino_block_size'
    mace_output_shape_str = 'output_shape'
    mace_begin_mask_str = 'begin_mask'
@@ -167,6 +168,8 @@ class MaceKeyword(object):
    mace_offset_str = 'offset'
    mace_from_caffe_str = 'from_caffe'
    mace_opencl_max_image_size = "opencl_max_image_size"
+    mace_seperate_buffer_str = 'seperate_buffer'
+    mace_scalar_input_index_str = 'scalar_input_index'


 class TransformerRule(Enum):

--- a/mace/python/tools/converter_tool/tensorflow_converter.py
+++ b/mace/python/tools/converter_tool/tensorflow_converter.py
@@ -401,13 +401,24 @@ class TensorflowConverter(base_converter.ConverterInterface):
        type_arg.name = MaceKeyword.mace_element_type_str
        type_arg.i = self.eltwise_type[tf_op.type].value

+        def check_is_scalar(tf_op):
+            if len(tf_op.inputs) == 1:
+                return len(tf_op.inputs[0].shape) == 0
+            elif len(tf_op.inputs) == 2:
+                return len(tf_op.inputs[0].shape) == 0 and\
+                       len(tf_op.inputs[1].shape) == 0
+
+        if check_is_scalar(tf_op):
+            op.type = MaceOp.ScalarMath.name
+        else:
+            op.type = MaceOp.Eltwise.name
        if tf_op.type == TFOpType.Square:
            value_arg = op.arg.add()
-            value_arg.name = MaceKeyword.mace_value_str
+            value_arg.name = MaceKeyword.mace_scalar_input_str
            value_arg.f = 2.0
        elif tf_op.type == TFOpType.Rsqrt:
            value_arg = op.arg.add()
-            value_arg.name = MaceKeyword.mace_value_str
+            value_arg.name = MaceKeyword.mace_scalar_input_str
            value_arg.f = -0.5

        if type_arg.i != EltwiseType.NEG.value \
@@ -418,19 +429,31 @@ class TensorflowConverter(base_converter.ConverterInterface):
                        EltwiseType.SUM, EltwiseType.PROD,
                        EltwiseType.MAX, EltwiseType.MIN]

-                if len(tf_op.inputs) > 1 and len(tf_op.inputs[1].shape) == 0:
+                if len(tf_op.inputs) > 1 and\
+                        len(tf_op.inputs[1].shape) == 0 and\
+                        tf_op.inputs[1].op.type == TFOpType.Const.name:
                    scalar = tf_op.inputs[1].eval().astype(np.float32)
                    value_arg = op.arg.add()
-                    value_arg.name = MaceKeyword.mace_value_str
+                    value_arg.name = MaceKeyword.mace_scalar_input_str
                    value_arg.f = scalar
                    self._skip_tensor.add(tf_op.inputs[1].name)
+                    value_index_arg = op.arg.add()
+                    value_index_arg.name =\
+                        MaceKeyword.mace_scalar_input_index_str
+                    value_index_arg.i = 1
+                    self._skip_tensor.add(tf_op.inputs[1].name)
                    del op.input[1]
-                elif len(tf_op.inputs[0].shape) == 0 and \
+                elif len(tf_op.inputs[0].shape) == 0 and\
+                        tf_op.inputs[0].op.type == TFOpType.Const.name and\
                        is_commutative(type_arg.i):
                    scalar = tf_op.inputs[0].eval().astype(np.float32)
                    value_arg = op.arg.add()
-                    value_arg.name = MaceKeyword.mace_value_str
+                    value_arg.name = MaceKeyword.mace_scalar_input_str
                    value_arg.f = scalar
+                    value_index_arg = op.arg.add()
+                    value_index_arg.name =\
+                        MaceKeyword.mace_scalar_input_index_str
+                    value_index_arg.i = 0
                    self._skip_tensor.add(tf_op.inputs[0].name)
                    del op.input[0]
            except tf.errors.InvalidArgumentError:
@@ -771,7 +794,6 @@ class TensorflowConverter(base_converter.ConverterInterface):
    def convert_split(self, tf_op):
        axis = tf_op.inputs[0].eval().astype(np.int32)
        axis = len(op.output_shape[0].dims) + axis if axis < 0 else axis
-        input_shape = self.infer_tensor_shape(tf_op.inputs[1])
        op = self.convert_general_op(tf_op)
        op.type = MaceOp.Split.name
        del op.input[0]

--- a/mace/python/tools/converter_tool/transformer.py
+++ b/mace/python/tools/converter_tool/transformer.py
@@ -117,7 +117,6 @@ class Transformer(base_converter.ConverterInterface):
                changed = transformer()
                if not changed:
                        break
-
        return self._model

    def filter_format(self):

--- a/mace/python/tools/memory_optimizer.py
+++ b/mace/python/tools/memory_optimizer.py
@@ -228,16 +228,24 @@ class GPUMemoryOptimizer(MemoryOptimizer):
                mace_pb2.GPU_IMAGE,
                calculate_image_shape(OpenCLBufferType.IN_OUT_HEIGHT,
                                      buffer_shape))
-        elif op_type == 'Shape':
-            mem_block = MemoryBlock(mace_pb2.CPU_BUFFER,
-                                    [output_shape[0], 1])
+        elif op_type in ['Shape', 'StridedSlice', 'Stack', 'ScalarMath']:
+            if len(output_shape) == 1:
+                mem_block = MemoryBlock(mace_pb2.CPU_BUFFER,
+                                        [output_shape[0], 1])
+            elif len(output_shape) == 0:
+                mem_block = MemoryBlock(mace_pb2.CPU_BUFFER,
+                                        [1, 1])
+            else:
+                raise Exception('%s output shape dim size is not 0 or 1.' %
+                                op_type)
        else:
            if len(output_shape) == 2:  # only support fc/softmax
                buffer_shape = [output_shape[0], 1, 1, output_shape[1]]
            elif len(output_shape) == 4:
                buffer_shape = output_shape
            else:
-                raise Exception('output shape dim size is not 2 or 4.')
+                raise Exception('%s output shape dim size is not 2 or 4.' %
+                                op_type)
            mem_block = MemoryBlock(
                mace_pb2.GPU_IMAGE,
                calculate_image_shape(OpenCLBufferType.IN_OUT_CHANNEL,