Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
78fd1e66
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
331
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
78fd1e66
编写于
6月 13, 2018
作者:
E
eclipsess
浏览文件
操作
浏览文件
下载
差异文件
Merge remote-tracking branch 'upstream/develop' into develop
上级
79e3c2e4
68929bbc
变更
39
展开全部
显示空白变更内容
内联
并排
Showing
39 changed file
with
877 addition
and
478 deletion
+877
-478
CMakeLists.txt
CMakeLists.txt
+31
-6
src/common/types.h
src/common/types.h
+1
-0
src/framework/op_registry.h
src/framework/op_registry.h
+31
-16
src/framework/operator.cpp
src/framework/operator.cpp
+5
-0
src/framework/operator.h
src/framework/operator.h
+1
-0
src/framework/tensor.h
src/framework/tensor.h
+0
-1
src/io/io.cpp
src/io/io.cpp
+28
-0
src/operators/batchnorm_op.cpp
src/operators/batchnorm_op.cpp
+8
-2
src/operators/box_coder_op.cpp
src/operators/box_coder_op.cpp
+8
-2
src/operators/concat_op.cpp
src/operators/concat_op.cpp
+8
-2
src/operators/conv_op.cpp
src/operators/conv_op.cpp
+12
-2
src/operators/depthwise_conv_op.cpp
src/operators/depthwise_conv_op.cpp
+8
-2
src/operators/elementwise_add_op.cpp
src/operators/elementwise_add_op.cpp
+8
-2
src/operators/feed_op.h
src/operators/feed_op.h
+8
-2
src/operators/fetch_op.h
src/operators/fetch_op.h
+8
-2
src/operators/fusion_conv_add.cpp
src/operators/fusion_conv_add.cpp
+33
-4
src/operators/fusion_conv_add.h
src/operators/fusion_conv_add.h
+27
-7
src/operators/fusion_conv_add_relu_op.h
src/operators/fusion_conv_add_relu_op.h
+8
-2
src/operators/fusion_fc_op.cpp
src/operators/fusion_fc_op.cpp
+8
-2
src/operators/fusion_fc_op.h
src/operators/fusion_fc_op.h
+8
-3
src/operators/kernel/arm/conv_add_kernel.cpp
src/operators/kernel/arm/conv_add_kernel.cpp
+138
-0
src/operators/kernel/arm/relu_kernel.cpp
src/operators/kernel/arm/relu_kernel.cpp
+64
-6
src/operators/kernel/conv_add_kernel.h
src/operators/kernel/conv_add_kernel.h
+57
-0
src/operators/kernel/fpga/conv_kernel.cpp
src/operators/kernel/fpga/conv_kernel.cpp
+7
-6
src/operators/lrn_op.cpp
src/operators/lrn_op.cpp
+8
-2
src/operators/mul_op.cpp
src/operators/mul_op.cpp
+8
-2
src/operators/multiclass_nms_op.cpp
src/operators/multiclass_nms_op.cpp
+8
-2
src/operators/op_param.cpp
src/operators/op_param.cpp
+26
-0
src/operators/op_param.h
src/operators/op_param.h
+51
-0
src/operators/pool_op.cpp
src/operators/pool_op.cpp
+8
-2
src/operators/prior_box_op.cpp
src/operators/prior_box_op.cpp
+8
-2
src/operators/relu_op.cpp
src/operators/relu_op.cpp
+8
-2
src/operators/reshape_op.cpp
src/operators/reshape_op.cpp
+8
-2
src/operators/sigmoid_op.cpp
src/operators/sigmoid_op.cpp
+8
-2
src/operators/softmax_op.cpp
src/operators/softmax_op.cpp
+8
-2
src/operators/transpose_op.cpp
src/operators/transpose_op.cpp
+8
-2
test/net/test_googlenet.cpp
test/net/test_googlenet.cpp
+3
-3
tools/build.sh
tools/build.sh
+3
-5
tools/ios-cmake/ios.toolchain.cmake
tools/ios-cmake/ios.toolchain.cmake
+199
-381
未找到文件。
CMakeLists.txt
浏览文件 @
78fd1e66
cmake_minimum_required
(
VERSION 3.0
)
project
(
paddle-mobile
)
option
(
DEBUGING
"enable debug mode"
O
FF
)
option
(
DEBUGING
"enable debug mode"
O
N
)
option
(
USE_OPENMP
"openmp support"
OFF
)
option
(
USE_EXCEPTION
"use std exception"
OFF
)
option
(
USE_EXCEPTION
"use std exception"
ON
)
option
(
LOG_PROFILE
"log profile"
ON
)
# select the platform to build
option
(
CPU
"cpu"
ON
)
option
(
MALI_GPU
"mali gpu"
OFF
)
option
(
FPGA
"fpga"
OFF
)
if
(
CPU
)
add_definitions
(
-DPADDLE_MOBILE_CPU
)
elseif
(
MALI_GPU
)
add_definitions
(
-DPADDLE_MOBILE_MALI_GPU
)
elseif
(
FPGA
)
add_definitions
(
-DPADDLE_MOBILE_FPGA
)
endif
()
set
(
CMAKE_CXX_FLAGS
"-std=c++14 -O3 -s
${
CMAKE_CXX_FLAGS
}
"
)
if
(
DEBUGING
)
set
(
CMAKE_BUILD_TYPE Debug
)
set
(
CMAKE_CXX_FLAGS_DEBUG
"
${
CMAKE_CXX_FLAGS
}
"
)
else
()
set
(
CMAKE_BUILD_TYPE Release
)
endif
()
...
...
@@ -24,12 +39,17 @@ else()
endif
()
if
(
USE_EXCEPTION
)
message
(
STATUS
"use exception"
)
add_definitions
(
-DENABLE_EXCEPTION
)
add_definitions
(
-fexceptions
)
else
()
add_definitions
(
-fno-exceptions
)
endif
()
if
(
LOG_PROFILE
)
add_definitions
(
-DPADDLE_MOBILE_PROFILE
)
endif
()
if
(
IS_MAC
)
add_definitions
(
-DX86
)
elseif
(
IS_IOS
)
...
...
@@ -42,7 +62,6 @@ else ()
add_definitions
(
-DX86
)
endif
()
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-std=c++14"
)
set
(
CMAKE_VERBOSE_MAKEFILE ON
)
set
(
CMAKE_EXPORT_COMPILE_COMMANDS ON
)
set
(
CMAKE_ARCHIVE_OUTPUT_DIRECTORY build
)
...
...
@@ -74,6 +93,7 @@ if (googlenet)
add_definitions
(
-DFUSION_FC_OP
)
add_definitions
(
-DPOOL_OP
)
add_definitions
(
-DRELU_OP
)
add_definitions
(
-DFUSION_CONVADD_OP
)
elseif
(
mobilenet
)
add_definitions
(
-DCONV_OP
)
add_definitions
(
-DELEMENTWISEADD_OP
)
...
...
@@ -112,7 +132,7 @@ else ()
add_definitions
(
-DCONV_OP
)
add_definitions
(
-DDEPTHWISECONV_OP
)
add_definitions
(
-DELEMENTWISEADD_OP
)
add_definitions
(
-DFUSIONCONVADD_OP
)
add_definitions
(
-DFUSION
_
CONVADD_OP
)
add_definitions
(
-DCONVADDRELU_OP
)
add_definitions
(
-DFUSION_FC_OP
)
add_definitions
(
-DLRN_OP
)
...
...
@@ -127,8 +147,13 @@ else ()
add_definitions
(
-DTRANSPOSE_OP
)
endif
()
add_library
(
paddle-mobile SHARED
${
PADDLE_MOBILE_CC
}
${
PADDLE_MOBILE_H
}
)
if
(
IS_IOS
)
add_library
(
paddle-mobile STATIC
${
PADDLE_MOBILE_CC
}
${
PADDLE_MOBILE_H
}
)
elseif
(
ANDROID
)
add_library
(
paddle-mobile SHARED
${
PADDLE_MOBILE_CC
}
${
PADDLE_MOBILE_H
}
)
else
()
add_library
(
paddle-mobile SHARED
${
PADDLE_MOBILE_CC
}
${
PADDLE_MOBILE_H
}
)
endif
()
if
(
DEBUGING
)
add_subdirectory
(
test
)
...
...
src/common/types.h
浏览文件 @
78fd1e66
...
...
@@ -99,6 +99,7 @@ static std::unordered_map<
std
::
string
,
std
::
pair
<
std
::
vector
<
std
::
string
>
,
std
::
vector
<
std
::
string
>>>
op_input_output_key
=
{
{
G_OP_TYPE_CONV
,
{{
"Input"
},
{
"Output"
}}},
{
G_OP_TYPE_CONV_ADD
,
{{
"Input"
},
{
"Out"
}}},
{
G_OP_TYPE_RELU
,
{{
"X"
},
{
"Out"
}}},
{
G_OP_TYPE_SOFTMAX
,
{{
"X"
},
{
"Out"
}}},
{
G_OP_TYPE_MUL
,
{{
"X"
},
{
"Out"
}}},
...
...
src/framework/op_registry.h
浏览文件 @
78fd1e66
...
...
@@ -96,24 +96,39 @@ class OpRegistry {
}
};
#define REGISTER_OPERATOR(op_type, op_class
)
\
#define REGISTER_OPERATOR(op_type, op_class
, device_name, device_type)
\
template <typename Dtype, typename T> \
class _OpClass_##op_type##_
: public op_class<Dtype, T> {
\
class _OpClass_##op_type##_
##device_name : public op_class<Dtype, T> {
\
public: \
DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_
, op_class);
\
DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_
##device_name, op_class);
\
}; \
static paddle_mobile::framework::OperatorRegistrar< \
paddle_mobile::CPU, _OpClass_##op_type##_<paddle_mobile::CPU
, float>> \
__op_registrar_##op_type##_
_(#op_type);
\
int TouchOpRegistrar_##op_type
() {
\
__op_registrar_##op_type##_
_.Touch();
\
device_type, _OpClass_##op_type##_##device_name<device_type
, float>> \
__op_registrar_##op_type##_
##device_name(#op_type);
\
int TouchOpRegistrar_##op_type
##_##device_name() {
\
__op_registrar_##op_type##_
##device_name.Touch();
\
return 0; \
}
#define USE_OP(op_type) \
extern int TouchOpRegistrar_##op_type(); \
static int use_op_itself_##op_type##_ __attribute__((unused)) = \
TouchOpRegistrar_##op_type()
#define REGISTER_OPERATOR_CPU(op_type, op_class) \
REGISTER_OPERATOR(op_type, op_class, cpu, paddle_mobile::CPU);
#define REGISTER_OPERATOR_MALI_GPU(op_type, op_class) \
REGISTER_OPERATOR(op_type, op_class, mali_gpu, paddle_mobile::GPU_MALI);
#define REGISTER_OPERATOR_FPGA(op_type, op_class) \
REGISTER_OPERATOR(op_type, op_class, fpga, paddle_mobile::FPGA);
#define USE_OP(op_type, device_name) \
extern int TouchOpRegistrar_##op_type##_##device_name(); \
static int use_op_itself_##op_type##_##device_name __attribute__((unused)) = \
TouchOpRegistrar_##op_type##_##device_name()
#define USE_OP_CPU(op_type) USE_OP(op_type, cpu);
#define USE_OP_MALI_GPU(op_type) USE_OP(op_type, mali_gpu);
#define USE_OP_FPGA(op_type) USE_OP(op_type, fpga);
}
// namespace framework
}
// namespace paddle_mobile
src/framework/operator.cpp
浏览文件 @
78fd1e66
...
...
@@ -58,7 +58,12 @@ void OperatorBase<Dtype>::Run() const {
}
template
class
OperatorBase
<
CPU
>;
template
class
OperatorBase
<
FPGA
>;
template
class
OperatorBase
<
GPU_MALI
>;
template
class
OperatorWithKernel
<
CPU
>;
template
class
OperatorWithKernel
<
FPGA
>;
template
class
OperatorWithKernel
<
GPU_MALI
>;
}
// namespace framework
}
// namespace paddle_mobile
src/framework/operator.h
浏览文件 @
78fd1e66
...
...
@@ -153,6 +153,7 @@ class FusionOpMatcher {
std
::
string
BeginType
()
{
return
node_
.
Type
();
}
// virtual bool Fusion();
protected:
Node
node_
;
std
::
string
type_
;
...
...
src/framework/tensor.h
浏览文件 @
78fd1e66
...
...
@@ -131,7 +131,6 @@ class Tensor {
}
PADDLE_MOBILE_ENFORCE
(
numel
()
>=
0
,
"the Tensor'snumel must >=0."
)
int64_t
size
=
numel
()
*
SizeOfType
(
type
);
/* some versions of boost::variant don't have operator!= */
if
(
holder_
==
nullptr
||
holder_
->
size
()
<
size
+
offset_
)
{
holder_
.
reset
(
new
PlaceholderImpl
(
size
,
type
));
offset_
=
0
;
...
...
src/io/io.cpp
浏览文件 @
78fd1e66
...
...
@@ -14,6 +14,10 @@ limitations under the License. */
#include "io.h"
#include <vector>
#ifdef PADDLE_MOBILE_PROFILE
#include <ctime>
#include <map>
#endif
#include "common/enforce.h"
#include "common/log.h"
...
...
@@ -336,10 +340,34 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
feed_tensor
->
ShareDataWith
(
t
);
std
::
shared_ptr
<
framework
::
BlockDesc
>
to_predict_block
=
to_predict_program_
->
Block
(
0
);
#ifdef PADDLE_MOBILE_PROFILE
std
::
map
<
std
::
string
,
clock_t
>
_profile
;
#endif
for
(
int
j
=
0
;
j
<
ops_of_block_
[
*
to_predict_block
.
get
()].
size
();
++
j
)
{
auto
op
=
ops_of_block_
[
*
to_predict_block
.
get
()][
j
];
#ifdef PADDLE_MOBILE_PROFILE
_profile
[
op
->
Type
()]
=
clock
();
#endif
op
->
Run
();
#ifdef PADDLE_MOBILE_PROFILE
_profile
[
op
->
Type
()]
=
clock
()
-
_profile
[
op
->
Type
()];
#endif
}
#ifdef PADDLE_MOBILE_PROFILE
{
DLOG
<<
"========================[ profile ]=========================="
;
clock_t
_ptotal
=
0
;
for
(
auto
const
&
p
:
_profile
)
{
_ptotal
+=
p
.
second
;
}
for
(
auto
const
&
p
:
_profile
)
{
DLOG
<<
p
.
first
<<
std
::
string
(
16
-
p
.
first
.
size
(),
' '
)
<<
"
\t
"
<<
(
float
)
p
.
second
<<
"
\t\t
"
<<
(
float
)
p
.
second
/
(
float
)
_ptotal
*
100.0
;
}
DLOG
<<
"========================[ ]=========================="
;
}
#endif
auto
ops
=
ops_of_block_
[
*
to_predict_program_
->
Block
(
0
)];
auto
last_op
=
ops
.
rbegin
();
auto
output_map
=
(
*
last_op
)
->
Outputs
();
...
...
src/operators/batchnorm_op.cpp
浏览文件 @
78fd1e66
...
...
@@ -31,7 +31,13 @@ template class BatchNormOp<CPU, float>;
}
// namespace paddle_mobile
namespace
ops
=
paddle_mobile
::
operators
;
USE_OP
(
batch_norm
);
REGISTER_OPERATOR
(
batch_norm
,
ops
::
BatchNormOp
);
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU
(
batch_norm
);
REGISTER_OPERATOR_CPU
(
batch_norm
,
ops
::
BatchNormOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
src/operators/box_coder_op.cpp
浏览文件 @
78fd1e66
...
...
@@ -52,7 +52,13 @@ template class BoxCoderOp<CPU, float>;
}
// namespace paddle_mobile
namespace
ops
=
paddle_mobile
::
operators
;
USE_OP
(
box_coder
);
REGISTER_OPERATOR
(
box_coder
,
ops
::
BoxCoderOp
);
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU
(
box_coder
);
REGISTER_OPERATOR_CPU
(
box_coder
,
ops
::
BoxCoderOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
src/operators/concat_op.cpp
浏览文件 @
78fd1e66
...
...
@@ -62,7 +62,13 @@ template class ConcatOp<CPU, float>;
}
// namespace paddle_mobile
namespace
ops
=
paddle_mobile
::
operators
;
USE_OP
(
concat
);
REGISTER_OPERATOR
(
concat
,
ops
::
ConcatOp
);
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU
(
concat
);
REGISTER_OPERATOR_CPU
(
concat
,
ops
::
ConcatOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
src/operators/conv_op.cpp
浏览文件 @
78fd1e66
...
...
@@ -53,7 +53,17 @@ template class ConvOp<CPU, float>;
}
// namespace paddle_mobile
namespace
ops
=
paddle_mobile
::
operators
;
USE_OP
(
conv2d
);
REGISTER_OPERATOR
(
conv2d
,
ops
::
ConvOp
);
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU
(
conv2d
);
REGISTER_OPERATOR_CPU
(
conv2d
,
ops
::
ConvOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU
(
conv2d
);
REGISTER_OPERATOR_MALI_GPU
(
conv2d
,
ops
::
ConvOp
);
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA
(
conv2d
);
REGISTER_OPERATOR_FPGA
(
conv2d
,
ops
::
ConvOp
);
#endif
#endif
src/operators/depthwise_conv_op.cpp
浏览文件 @
78fd1e66
...
...
@@ -54,7 +54,13 @@ template class DepthwiseConvOp<CPU, float>;
}
// namespace paddle_mobile
namespace
ops
=
paddle_mobile
::
operators
;
USE_OP
(
depthwise_conv2d
);
REGISTER_OPERATOR
(
depthwise_conv2d
,
ops
::
DepthwiseConvOp
);
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU
(
depthwise_conv2d
);
REGISTER_OPERATOR_CPU
(
depthwise_conv2d
,
ops
::
DepthwiseConvOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
src/operators/elementwise_add_op.cpp
浏览文件 @
78fd1e66
...
...
@@ -29,7 +29,13 @@ template class ElementwiseAddOp<CPU, float>;
}
// namespace paddle_mobile
namespace
ops
=
paddle_mobile
::
operators
;
USE_OP
(
elementwise_add
);
REGISTER_OPERATOR
(
elementwise_add
,
ops
::
ElementwiseAddOp
);
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU
(
elementwise_add
);
REGISTER_OPERATOR_CPU
(
elementwise_add
,
ops
::
ElementwiseAddOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
src/operators/feed_op.h
浏览文件 @
78fd1e66
...
...
@@ -43,8 +43,14 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
};
namespace
ops
=
paddle_mobile
::
operators
;
USE_OP
(
feed
);
REGISTER_OPERATOR
(
feed
,
ops
::
FeedOp
);
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU
(
feed
);
REGISTER_OPERATOR_CPU
(
feed
,
ops
::
FeedOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
}
// namespace operators
}
// namespace paddle_mobile
src/operators/fetch_op.h
浏览文件 @
78fd1e66
...
...
@@ -43,8 +43,14 @@ class FetchOp : public framework::OperatorBase<DeviceType> {
};
namespace
ops
=
paddle_mobile
::
operators
;
USE_OP
(
fetch
);
REGISTER_OPERATOR
(
fetch
,
ops
::
FetchOp
);
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU
(
fetch
);
REGISTER_OPERATOR_CPU
(
fetch
,
ops
::
FetchOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
}
// namespace operators
}
// namespace paddle_mobile
src/operators/fusion_conv_add.cpp
浏览文件 @
78fd1e66
...
...
@@ -12,20 +12,49 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSIONCONVADD_OP
#ifdef FUSION
_
CONVADD_OP
#include "operators/fusion_conv_add.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
Dtype
,
typename
T
>
void
FushionConvAddOp
<
Dtype
,
T
>::
InferShape
()
const
{}
void
FushionConvAddOp
<
Dtype
,
T
>::
InferShape
()
const
{
auto
in_dims
=
param_
.
Input
()
->
dims
();
auto
filter_dims
=
param_
.
Filter
()
->
dims
();
const
std
::
vector
<
int
>
&
strides
=
param_
.
Strides
();
std
::
vector
<
int
>
paddings
=
param_
.
Paddings
();
int
groups
=
param_
.
Groups
();
std
::
vector
<
int
>
dilations
=
param_
.
Dilations
();
PADDLE_MOBILE_ENFORCE
((
in_dims
.
size
()
==
filter_dims
.
size
()
&&
dilations
.
size
()
==
paddings
.
size
()
&&
paddings
.
size
()
==
strides
.
size
()),
"ConvParam is not suitable"
);
std
::
vector
<
int64_t
>
output_shape
({
in_dims
[
0
],
filter_dims
[
0
]});
for
(
size_t
i
=
0
;
i
<
strides
.
size
();
++
i
)
{
output_shape
.
push_back
(
ConvOutputSize
(
in_dims
[
i
+
2
],
filter_dims
[
i
+
2
],
dilations
[
i
],
paddings
[
i
],
strides
[
i
]));
}
framework
::
DDim
ddim
=
framework
::
make_ddim
(
output_shape
);
param_
.
Output
()
->
Resize
(
ddim
);
}
template
class
FushionConvAddOp
<
CPU
,
float
>;
}
// namespace operators
}
// namespace paddle_mobile
namespace
ops
=
paddle_mobile
::
operators
;
USE_OP
(
conv_add
);
REGISTER_OPERATOR
(
conv_add
,
ops
::
FushionConvAddOp
);
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU
(
conv_add
);
REGISTER_OPERATOR_CPU
(
conv_add
,
ops
::
FushionConvAddOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
src/operators/fusion_conv_add.h
浏览文件 @
78fd1e66
...
...
@@ -11,16 +11,17 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSIONCONVADD_OP
#define FUSION_CONVADD_OP
#ifdef FUSION
_
CONVADD_OP
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "op_param.h"
#include "operators/kernel/conv_add_kernel.h"
namespace
paddle_mobile
{
namespace
operators
{
...
...
@@ -53,18 +54,37 @@ class FushionConvAddOp : public framework::OperatorWithKernel<DeviceType> {
const
framework
::
AttributeMap
&
attrs
,
std
::
shared_ptr
<
framework
::
Scope
>
scope
)
:
framework
::
OperatorWithKernel
<
DeviceType
>
(
type
,
inputs
,
outputs
,
attrs
,
scope
)
{}
scope
),
param_
(
inputs
,
outputs
,
attrs
,
*
scope
)
{}
void
RunImpl
()
const
{}
void
RunImpl
()
const
{
operators
::
ConvAddKernel
<
DeviceType
,
T
>
kernel
;
kernel
.
Compute
(
param_
);
this
->
ClearVariables
({
"Filter"
,
"Input"
,
"Y"
});
}
using
framework
::
OperatorWithKernel
<
DeviceType
>::
OperatorWithKernel
;
void
InferShape
()
const
override
;
protected:
// FushionFc
Param param_;
FushionConvAdd
Param
param_
;
};
// static framework::FusionOpRegistrar fc_registrar(new FusionConvAddMatcher());
inline
int
ConvOutputSize
(
int
input_size
,
int
filter_size
,
int
dilation
,
int
padding
,
int
stride
)
{
const
int
dkernel
=
dilation
*
(
filter_size
-
1
)
+
1
;
int
output_size
=
(
input_size
+
2
*
padding
-
dkernel
)
/
stride
+
1
;
return
output_size
;
}
#ifdef PADDLE_MOBILE_CPU
static
framework
::
FusionOpRegistrar
convadd_registrar
(
new
FusionConvAddMatcher
());
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
}
// namespace operators
}
// namespace paddle_mobile
...
...
src/operators/fusion_conv_add_relu_op.h
浏览文件 @
78fd1e66
...
...
@@ -46,8 +46,14 @@ class ConvAddReluOp {
private:
};
// static framework::FusionOpRegistrar fc_registrar(
#ifdef PADDLE_MOBILE_CPU
// static framework::FusionOpRegistrar fusion_conv_add_relu_registrar(
// new FushionConvAddReluOpMatcher());
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
}
// namespace operators
}
// namespace paddle_mobile
...
...
src/operators/fusion_fc_op.cpp
浏览文件 @
78fd1e66
...
...
@@ -54,7 +54,13 @@ template class FushionFcOp<CPU, float>;
}
// namespace paddle_mobile
namespace
ops
=
paddle_mobile
::
operators
;
USE_OP
(
fc
);
REGISTER_OPERATOR
(
fc
,
ops
::
FushionFcOp
);
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU
(
fc
);
REGISTER_OPERATOR_CPU
(
fc
,
ops
::
FushionFcOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
src/operators/fusion_fc_op.h
浏览文件 @
78fd1e66
...
...
@@ -37,8 +37,6 @@ class FusionFcMatcher : public framework::FusionOpMatcher {
void
FolderNodes
(
framework
::
Node
*
node
,
std
::
vector
<
std
::
shared_ptr
<
framework
::
Node
>>
*
removed_nodes
)
{
vector
<
std
::
shared_ptr
<
framework
::
OpDesc
>>
origin_descs
=
node
->
OpDescs
(
node_
.
Depth
());
node
->
Folder
(
node_
.
Depth
(),
Type
(),
{{
G_OP_TYPE_ELEMENTWISE_ADD
,
{
"Y"
,
"Z"
}}},
removed_nodes
);
}
...
...
@@ -69,7 +67,14 @@ class FushionFcOp : public framework::OperatorWithKernel<DeviceType> {
FushionFcParam
param_
;
};
// static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher());
#ifdef PADDLE_MOBILE_CPU
static
framework
::
FusionOpRegistrar
fc_registrar
(
new
FusionFcMatcher
());
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
static
framework
::
FusionOpRegistrar
fc_registrar
(
new
FusionFcMatcher
());
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
}
// namespace operators
}
// namespace paddle_mobile
...
...
src/operators/kernel/arm/conv_add_kernel.cpp
0 → 100644
浏览文件 @
78fd1e66
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADD_OP
#include "operators/kernel/conv_add_kernel.h"
namespace
paddle_mobile
{
namespace
operators
{
void
expand_bias
(
Tensor
&
bias
,
int
axis
,
const
DDim
&
dDim
)
{
auto
bias_ptr
=
bias
.
data
<
float
>
();
const
DDim
bias_ddim
=
bias
.
dims
();
PADDLE_MOBILE_ENFORCE
(
bias
.
dims
().
size
()
==
1
,
"the bias tensor's dims size != 1"
)
DDim
outer_ddim
=
paddle_mobile
::
framework
::
slice_ddim
(
dDim
,
0
,
axis
+
1
);
DDim
inner_ddim
=
paddle_mobile
::
framework
::
slice_ddim
(
dDim
,
axis
+
1
,
dDim
.
size
());
int
outer_size
=
paddle_mobile
::
framework
::
product
(
outer_ddim
);
int
inner_size
=
paddle_mobile
::
framework
::
product
(
inner_ddim
);
bias
.
Resize
(
dDim
);
auto
new_ptr
=
bias
.
mutable_data
<
float
>
();
int
axis_size
=
dDim
[
axis
];
for
(
int
i
=
0
;
i
<
outer_size
;
++
i
)
{
float
v_bias
=
bias_ptr
[
i
*
axis_size
/
outer_size
];
for
(
int
j
=
0
;
j
<
inner_size
;
++
j
)
{
new_ptr
[
i
*
inner_size
+
j
]
=
v_bias
;
}
}
}
template
<
>
void
ConvAddKernel
<
CPU
,
float
>::
Compute
(
const
FushionConvAddParam
&
param
)
const
{
DLOG
<<
param
;
const
Tensor
*
input
=
param
.
Input
();
Tensor
filter
=
*
param
.
Filter
();
Tensor
bias
=
*
param
.
Bias
();
int
axis
=
param
.
Axis
();
Tensor
*
output
=
param
.
Output
();
expand_bias
(
bias
,
axis
,
output
->
dims
());
output
->
ShareDataWith
(
bias
);
int
groups
=
param
.
Groups
();
std
::
vector
<
int
>
strides
=
param
.
Strides
();
std
::
vector
<
int
>
paddings
=
param
.
Paddings
();
std
::
vector
<
int
>
dilations
=
param
.
Dilations
();
const
int
batch_size
=
static_cast
<
int
>
(
input
->
dims
()[
0
]);
std
::
vector
<
int64_t
>
filter_shape_vec
(
framework
::
vectorize
(
filter
.
dims
()));
std
::
vector
<
int64_t
>
output_shape_vec
(
framework
::
vectorize
(
output
->
dims
()));
size_t
data_dim
=
filter_shape_vec
.
size
()
-
2
;
std
::
vector
<
int64_t
>
col_shape_vec
(
1
+
2
*
data_dim
);
col_shape_vec
[
0
]
=
input
->
dims
()[
1
]
/
groups
;
for
(
size_t
j
=
0
;
j
<
data_dim
;
++
j
)
{
col_shape_vec
[
j
+
1
]
=
filter_shape_vec
[
j
+
2
];
col_shape_vec
[
j
+
1
+
data_dim
]
=
output_shape_vec
[
j
+
2
];
}
framework
::
DDim
col_shape
(
framework
::
make_ddim
(
col_shape_vec
));
framework
::
DDim
col_matrix_shape
=
framework
::
flatten_to_2d
(
col_shape
,
data_dim
+
1
);
bool
is_expand
=
IsExpand
(
filter_shape_vec
,
strides
,
paddings
,
dilations
);
Tensor
col
;
Tensor
col_matrix
;
if
(
is_expand
)
{
col
.
mutable_data
<
float
>
(
col_shape
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
framework
::
DDim
input_shape
=
framework
::
slice_ddim
(
input
->
dims
(),
1
,
static_cast
<
int
>
(
input
->
dims
().
size
()));
framework
::
DDim
filter_matrix_shape
=
{
filter
.
dims
()[
0
],
filter
.
numel
()
/
filter
.
dims
()[
0
]};
filter
.
Resize
(
filter_matrix_shape
);
framework
::
DDim
output_matrix_shape
=
{
output
->
dims
()[
1
],
output
->
numel
()
/
(
output
->
dims
()[
0
]
*
output
->
dims
()[
1
])};
// convolution operator: im2col(or vol2col) + gemm
int
in_step
=
static_cast
<
int
>
(
input
->
dims
()[
1
])
/
groups
;
int
out_step
=
static_cast
<
int
>
(
output
->
dims
()[
1
])
/
groups
;
math
::
Vol2ColFunctor
<
CPU
,
float
>
vol2col
;
math
::
Im2ColFunctor
<
math
::
ColFormat
::
kCFO
,
CPU
,
float
>
im2col
;
for
(
int
i
=
0
;
i
<
batch_size
;
i
++
)
{
Tensor
in_batch
=
input
->
Slice
(
i
,
i
+
1
).
Resize
(
input_shape
);
Tensor
out_batch
=
output
->
Slice
(
i
,
i
+
1
).
Resize
(
output_matrix_shape
);
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
Tensor
in_slice
=
in_batch
.
Slice
(
g
*
in_step
,
(
g
+
1
)
*
in_step
);
if
(
!
is_expand
)
{
col
.
ShareDataWith
(
in_slice
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
else
if
(
data_dim
==
2U
)
{
// im2col
im2col
(
in_slice
,
dilations
,
strides
,
std
::
vector
<
int
>
{
paddings
[
0
],
paddings
[
1
],
paddings
[
0
],
paddings
[
1
]},
&
col
);
}
else
if
(
data_dim
==
3U
)
{
// vol2col
vol2col
(
in_slice
,
dilations
,
strides
,
paddings
,
&
col
);
}
// gemm
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
matmul
<
float
>
(
filter_slice
,
false
,
col_matrix
,
false
,
static_cast
<
float
>
(
1
),
&
out_slice
,
static_cast
<
float
>
(
1
));
}
}
}
template
class
ConvAddKernel
<
CPU
,
float
>;
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/arm/relu_kernel.cpp
浏览文件 @
78fd1e66
...
...
@@ -37,13 +37,71 @@ void ReluKernel<CPU, float>::Compute(const ReluParam ¶m) const {
auto
*
out
=
param
.
Out
();
auto
*
out_ptr
=
out
->
mutable_data
<
float
>
();
int
numel
=
input_x
->
numel
();
if
(
numel
>
32
)
{
asm
volatile
(
"pld [%[input_x_ptr], #0]
\n\t
"
"vmov.f32 q8, #0.0
\n\t
"
"subs %[num], %[num], #32
\n\t
"
"blt end_num_%=
\n\t
"
"loop_num_%=:
\n\t
"
"pld [%[input_x_ptr], #1024]
\n\t
"
"vld1.32 {q0, q1}, [%[input_x_ptr]]!
\n\t
"
"vld1.32 {q2, q3}, [%[input_x_ptr]]!
\n\t
"
"vld1.32 {q4, q5}, [%[input_x_ptr]]!
\n\t
"
"vld1.32 {q6, q7}, [%[input_x_ptr]]!
\n\t
"
"vmax.f32 q0, q0, q8
\n\t
"
"vmax.f32 q1, q1, q8
\n\t
"
"vmax.f32 q2, q2, q8
\n\t
"
"vmax.f32 q3, q3, q8
\n\t
"
"vmax.f32 q4, q4, q8
\n\t
"
"vmax.f32 q5, q5, q8
\n\t
"
"vmax.f32 q6, q6, q8
\n\t
"
"vmax.f32 q7, q7, q8
\n\t
"
"vst1.32 {q0, q1}, [%[out_ptr]]!
\n\t
"
"vst1.32 {q2, q3}, [%[out_ptr]]!
\n\t
"
"vst1.32 {q4, q5}, [%[out_ptr]]!
\n\t
"
"vst1.32 {q6, q7}, [%[out_ptr]]!
\n\t
"
"subs %[num], %[num], #32
\n\t
"
"bge loop_num_%=
\n\t
"
"end_num_%=:
\n\t
"
"cmp %[num], #0
\n\t
"
"bge end_%=
\n\t
"
"mov r6, #4
\n\t
"
"mul r5, %[num], r6
\n\t
"
"add %[input_x_ptr], %[input_x_ptr], r5
\n\t
"
"vld1.32 {q0, q1}, [%[input_x_ptr]]!
\n\t
"
"vld1.32 {q2, q3}, [%[input_x_ptr]]!
\n\t
"
"vld1.32 {q4, q5}, [%[input_x_ptr]]!
\n\t
"
"vld1.32 {q6, q7}, [%[input_x_ptr]]!
\n\t
"
"vmax.f32 q0, q0, q8
\n\t
"
"vmax.f32 q1, q1, q8
\n\t
"
"vmax.f32 q2, q2, q8
\n\t
"
"vmax.f32 q3, q3, q8
\n\t
"
"vmax.f32 q4, q4, q8
\n\t
"
"vmax.f32 q5, q5, q8
\n\t
"
"vmax.f32 q6, q6, q8
\n\t
"
"vmax.f32 q7, q7, q8
\n\t
"
"add %[out_ptr], %[out_ptr], r5
\n\t
"
"vst1.32 {q0, q1}, [%[out_ptr]]!
\n\t
"
"vst1.32 {q2, q3}, [%[out_ptr]]!
\n\t
"
"vst1.32 {q4, q5}, [%[out_ptr]]!
\n\t
"
"vst1.32 {q6, q7}, [%[out_ptr]]!
\n\t
"
"end_%=:
\n\t
"
:
:
[
out_ptr
]
"r"
(
out_ptr
),
[
input_x_ptr
]
"r"
(
input_x_ptr
),
[
num
]
"r"
(
numel
)
:
"memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
,
"r5"
,
"r6"
);
}
else
{
ReluFunctor
<
float
>
func_
;
math
::
Transform
trans
;
trans
(
input_x_ptr
,
input_x_ptr
+
input_x
->
numel
(),
out_ptr
,
func_
);
// for (int i = 0; i < input_x->numel(); i++) {
// out_ptr[i] = input_x_ptr[i] > 0 ? input_x_ptr[i] : 0;
// }
trans
(
input_x_ptr
,
input_x_ptr
+
numel
,
out_ptr
,
func_
);
}
}
}
// namespace operators
}
// namespace paddle_mobile
...
...
src/operators/kernel/conv_add_kernel.h
0 → 100644
浏览文件 @
78fd1e66
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADD_OP
#pragma once
#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
using
framework
::
DDim
;
using
framework
::
OpKernelBase
;
template
<
typename
DeviceType
,
typename
T
>
class
ConvAddKernel
:
public
OpKernelBase
<
DeviceType
,
FushionConvAddParam
>
{
public:
void
Compute
(
const
FushionConvAddParam
&
param
)
const
;
};
inline
bool
IsExpand
(
const
std
::
vector
<
int64_t
>
&
filter_dim
,
const
std
::
vector
<
int
>
&
strides
,
const
std
::
vector
<
int
>
&
paddings
,
const
std
::
vector
<
int
>
&
dilations
)
{
bool
filter_1
=
true
,
strides_1
=
true
,
padding_0
=
true
,
dilation_1
=
true
;
for
(
size_t
j
=
0
;
j
<
strides
.
size
();
++
j
)
{
filter_1
=
filter_1
&&
(
static_cast
<
int
>
(
filter_dim
[
j
+
2
])
==
1
);
strides_1
=
strides_1
&&
(
strides
[
j
]
==
1
);
padding_0
=
padding_0
&&
(
paddings
[
j
]
==
0
);
dilation_1
=
dilation_1
&&
(
dilations
[
j
]
==
1
);
}
return
!
(
filter_1
&&
strides_1
&&
padding_0
&&
dilation_1
);
}
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/fpga/conv_kernel.cpp
浏览文件 @
78fd1e66
...
...
@@ -14,15 +14,16 @@ limitations under the License. */
#ifdef CONV_OP
#include "operators/kernel/conv_kernel.h"
namespace
paddle_mobile
{
namespace
operators
{
// template<>
// void ConvKernel<FPGA, float>::Compute(const ConvParam ¶m) const
// {}
//
// template class ConvKernel<FPGA, float>;
}
template
<
>
void
ConvKernel
<
FPGA
,
float
>::
Compute
(
const
ConvParam
&
param
)
const
{}
template
class
ConvKernel
<
FPGA
,
float
>;
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/lrn_op.cpp
浏览文件 @
78fd1e66
...
...
@@ -29,7 +29,13 @@ template class LrnOp<CPU, float>;
}
// namespace paddle_mobile
namespace
ops
=
paddle_mobile
::
operators
;
USE_OP
(
lrn
);
REGISTER_OPERATOR
(
lrn
,
ops
::
LrnOp
);
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU
(
lrn
);
REGISTER_OPERATOR_CPU
(
lrn
,
ops
::
LrnOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
src/operators/mul_op.cpp
浏览文件 @
78fd1e66
...
...
@@ -55,7 +55,13 @@ template class MulOp<CPU, float>;
}
// namespace paddle_mobile
namespace
ops
=
paddle_mobile
::
operators
;
USE_OP
(
mul
);
REGISTER_OPERATOR
(
mul
,
ops
::
MulOp
);
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU
(
mul
);
REGISTER_OPERATOR_CPU
(
mul
,
ops
::
MulOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
src/operators/multiclass_nms_op.cpp
浏览文件 @
78fd1e66
...
...
@@ -39,7 +39,13 @@ template class MultiClassNMSOp<CPU, float>;
}
// namespace paddle_mobile
namespace
ops
=
paddle_mobile
::
operators
;
USE_OP
(
multiclass_nms
);
REGISTER_OPERATOR
(
multiclass_nms
,
ops
::
MultiClassNMSOp
);
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU
(
multiclass_nms
);
REGISTER_OPERATOR_CPU
(
multiclass_nms
,
ops
::
MultiClassNMSOp
);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
src/operators/op_param.cpp
浏览文件 @
78fd1e66
...
...
@@ -39,5 +39,31 @@ Print &operator<<(Print &printer, const ConvParam &conv_param) {
}
#endif
#ifdef FUSION_CONVADD_OP
/// Debug printer for the fused conv+add parameters.
/// Streams every attribute and tensor shape of the fused operator.
/// Fix: a separator is inserted between the two components of stride,
/// paddings and dilations — previously e.g. stride (1,1) rendered as
/// "(11)", which is ambiguous in logs.
Print &operator<<(Print &printer, const FushionConvAddParam &conv_param) {
  printer << "parameter of conv_add: "
          << "\n";
  printer << " stride: "
          << " (" << conv_param.Strides()[0] << ", " << conv_param.Strides()[1]
          << ") "
          << "\n";
  printer << " paddings: "
          << " (" << conv_param.Paddings()[0] << ", "
          << conv_param.Paddings()[1] << ") "
          << "\n";
  printer << " dilations: "
          << " (" << conv_param.Dilations()[0] << ", "
          << conv_param.Dilations()[1] << ") "
          << "\n";
  printer << " groups: " << conv_param.Groups() << "\n";
  printer << " input dims: " << conv_param.Input()->dims() << "\n";
  printer << " filter dims: " << conv_param.Filter()->dims() << "\n";
  printer << " bias dims: " << conv_param.Bias()->dims() << "\n";
  // No trailing newline on the last line; callers append their own.
  printer << " output dims: " << conv_param.Output()->dims();
  return printer;
}
#endif
}
// namespace operators
}
// namespace paddle_mobile
src/operators/op_param.h
浏览文件 @
78fd1e66
...
...
@@ -165,6 +165,8 @@ class OpParam {
template
<
typename
T
>
static
T
*
GetVarValue
(
const
string
&
key
,
const
VariableNameMap
&
var_map
,
const
Scope
&
scope
)
{
PADDLE_MOBILE_ENFORCE
(
var_map
.
count
(
key
)
>
0
,
"%s is not contained in var_map"
,
key
.
c_str
())
auto
var_vec
=
var_map
.
at
(
key
);
if
(
!
var_vec
.
empty
())
{
auto
var
=
scope
.
FindVar
(
var_vec
[
0
]);
...
...
@@ -787,5 +789,54 @@ class FushionFcParam : public OpParam {
};
#endif
#ifdef FUSION_CONVADD_OP
// Parameter pack for the fused conv + elementwise-add operator.
// Extracts all inputs/outputs/attributes from the op description once at
// construction, so kernels can read them without touching the Scope again.
// (The "Fushion" spelling is the established public name; kept for ABI/API
// compatibility.)
class FushionConvAddParam : public OpParam {
 public:
  FushionConvAddParam(const VariableNameMap &inputs,
                      const VariableNameMap &outputs, const AttributeMap &attrs,
                      const Scope &scope) {
    bias_ = InputYFrom<LoDTensor>(inputs, scope);
    axis_ = GetAttr<int>("axis", attrs);
    filter_ = FilterFrom<LoDTensor>(inputs, scope);
    input_ = InputFrom<LoDTensor>(inputs, scope);
    output_ = OutFrom<LoDTensor>(outputs, scope);
    strides_ = GetAttr<vector<int>>("strides", attrs);
    paddings_ = GetAttr<vector<int>>("paddings", attrs);
    dilations_ = GetAttr<vector<int>>("dilations", attrs);
    groups_ = GetAttr<int>("groups", attrs);
  }

  // Bias tensor added after the convolution (input "Y" of the fused op).
  Tensor *Bias() const { return bias_; }

  // Broadcast axis for the elementwise add.
  const int &Axis() const { return axis_; }

  const Tensor *Input() const { return input_; }

  const Tensor *Filter() const { return filter_; }

  Tensor *Output() const { return output_; }

  const vector<int> &Strides() const { return strides_; }

  const vector<int> &Paddings() const { return paddings_; }

  const vector<int> &Dilations() const { return dilations_; }

  const int &Groups() const { return groups_; }

 private:
  Tensor *bias_;
  int axis_;
  Tensor *input_;
  Tensor *output_;
  Tensor *filter_;
  vector<int> strides_;
  vector<int> paddings_;
  vector<int> dilations_;
  // Renamed from `groups` for consistency with the trailing-underscore
  // convention used by every other private member of this class.
  int groups_;
};

Print &operator<<(Print &printer, const FushionConvAddParam &conv_param);
#endif
}
// namespace operators
}
// namespace paddle_mobile
src/operators/pool_op.cpp
浏览文件 @
78fd1e66
...
...
@@ -59,7 +59,13 @@ template class PoolOp<CPU, float>;
}
// namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
// Per-backend registration: the old unconditional USE_OP/REGISTER_OPERATOR
// pair is replaced by CPU-guarded macros.
USE_OP_CPU(pool2d);
REGISTER_OPERATOR_CPU(pool2d, ops::PoolOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
// No Mali GPU kernel registered for pool2d yet.
#endif
#ifdef PADDLE_MOBILE_FPGA
// No FPGA kernel registered for pool2d yet.
#endif

#endif  // closes the operator feature guard opened at the top of this file
src/operators/prior_box_op.cpp
浏览文件 @
78fd1e66
...
...
@@ -49,7 +49,13 @@ template class PriorBoxOp<CPU, float>;
}
// namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
// Per-backend registration: the old unconditional USE_OP/REGISTER_OPERATOR
// pair is replaced by CPU-guarded macros.
USE_OP_CPU(prior_box);
REGISTER_OPERATOR_CPU(prior_box, ops::PriorBoxOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
// No Mali GPU kernel registered for prior_box yet.
#endif
#ifdef PADDLE_MOBILE_FPGA
// No FPGA kernel registered for prior_box yet.
#endif

#endif  // closes the operator feature guard opened at the top of this file
src/operators/relu_op.cpp
浏览文件 @
78fd1e66
...
...
@@ -33,7 +33,13 @@ template class ReluOp<CPU, float>;
* 都是需要和model中类型对应起来的
* */
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
// Per-backend registration: the old unconditional USE_OP/REGISTER_OPERATOR
// pair is replaced by CPU-guarded macros. The op name string must match
// the type recorded in the model file.
USE_OP_CPU(relu);
REGISTER_OPERATOR_CPU(relu, ops::ReluOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
// No Mali GPU kernel registered for relu yet.
#endif
#ifdef PADDLE_MOBILE_FPGA
// No FPGA kernel registered for relu yet.
#endif

#endif  // closes the operator feature guard opened at the top of this file
src/operators/reshape_op.cpp
浏览文件 @
78fd1e66
...
...
@@ -32,7 +32,13 @@ template class ReshapeOp<CPU, float>;
}
// namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
// Per-backend registration: the old unconditional USE_OP/REGISTER_OPERATOR
// pair is replaced by CPU-guarded macros.
USE_OP_CPU(reshape);
REGISTER_OPERATOR_CPU(reshape, ops::ReshapeOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
// No Mali GPU kernel registered for reshape yet.
#endif
#ifdef PADDLE_MOBILE_FPGA
// No FPGA kernel registered for reshape yet.
#endif

#endif  // closes the operator feature guard opened at the top of this file
src/operators/sigmoid_op.cpp
浏览文件 @
78fd1e66
...
...
@@ -27,7 +27,13 @@ template class SigmoidOp<CPU, float>;
}
// namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
// Per-backend registration: the old unconditional USE_OP/REGISTER_OPERATOR
// pair is replaced by CPU-guarded macros.
USE_OP_CPU(sigmoid);
REGISTER_OPERATOR_CPU(sigmoid, ops::SigmoidOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
// No Mali GPU kernel registered for sigmoid yet.
#endif
#ifdef PADDLE_MOBILE_FPGA
// No FPGA kernel registered for sigmoid yet.
#endif

#endif  // closes the operator feature guard opened at the top of this file
src/operators/softmax_op.cpp
浏览文件 @
78fd1e66
...
...
@@ -27,7 +27,13 @@ template class SoftmaxOp<CPU, float>;
}
// namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
// Per-backend registration: the old unconditional USE_OP/REGISTER_OPERATOR
// pair is replaced by CPU-guarded macros.
USE_OP_CPU(softmax);
REGISTER_OPERATOR_CPU(softmax, ops::SoftmaxOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
// No Mali GPU kernel registered for softmax yet.
#endif
#ifdef PADDLE_MOBILE_FPGA
// No FPGA kernel registered for softmax yet.
#endif

#endif  // closes the operator feature guard opened at the top of this file
src/operators/transpose_op.cpp
浏览文件 @
78fd1e66
...
...
@@ -52,7 +52,13 @@ template class TransposeOp<CPU, float>;
}
// namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
// Per-backend registration: the old unconditional USE_OP/REGISTER_OPERATOR
// pair is replaced by CPU-guarded macros.
USE_OP_CPU(transpose);
REGISTER_OPERATOR_CPU(transpose, ops::TransposeOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
// No Mali GPU kernel registered for transpose yet.
#endif
#ifdef PADDLE_MOBILE_FPGA
// No FPGA kernel registered for transpose yet.
#endif

#endif  // closes the operator feature guard opened at the top of this file
test/net/test_googlenet.cpp
浏览文件 @
78fd1e66
...
...
@@ -20,9 +20,9 @@ int main() {
paddle_mobile
::
Loader
<
paddle_mobile
::
CPU
>
loader
;
bool
optimize
=
false
;
auto
time1
=
time
();
//
auto program = loader.Load(g_googlenet, optimize);
auto
program
=
loader
.
Load
(
g_googlenet_combine
+
"/model"
,
g_googlenet_combine
+
"/params"
,
optimize
);
auto
program
=
loader
.
Load
(
g_googlenet
,
optimize
);
//
auto program = loader.Load(g_googlenet_combine + "/model",
//
g_googlenet_combine + "/params", optimize);
auto
time2
=
time
();
DLOG
<<
"load cost :"
<<
time_diff
(
time1
,
time2
)
<<
"ms
\n
"
;
paddle_mobile
::
Executor
<
paddle_mobile
::
CPU
>
executor
(
program
,
1
,
optimize
);
...
...
tools/build.sh
浏览文件 @
78fd1e66
...
...
@@ -15,7 +15,6 @@ build_for_mac() {
fi
PLATFORM
=
"x86"
MODE
=
"Release"
CXX_FLAGS
=
"-std=c++11 -O3 -s"
BUILD_DIR
=
../build/release/
"
${
PLATFORM
}
"
mkdir
-p
${
BUILD_DIR
}
/build
...
...
@@ -25,7 +24,6 @@ build_for_mac() {
cmake ..
\
-B
"
${
BUILD_DIR
}
"
\
-DCMAKE_BUILD_TYPE
=
"
${
MODE
}
"
\
-DCMAKE_CXX_FLAGS
=
"
${
CXX_FLAGS
}
"
\
-DIS_MAC
=
true
cd
${
BUILD_DIR
}
...
...
@@ -46,11 +44,11 @@ build_for_android() {
if
[
"
${
PLATFORM
}
"
=
"arm-v7a"
]
;
then
ABI
=
"armeabi-v7a with NEON"
ARM_PLATFORM
=
"V7"
CXX_FLAGS
=
"-
O3 -std=c++11 -s -
march=armv7-a -mfpu=neon -mfloat-abi=softfp -pie -fPIE -w -Wno-error=format-security"
CXX_FLAGS
=
"-march=armv7-a -mfpu=neon -mfloat-abi=softfp -pie -fPIE -w -Wno-error=format-security"
elif
[
"
${
PLATFORM
}
"
=
"arm-v8a"
]
;
then
ABI
=
"arm64-v8a"
ARM_PLATFORM
=
"V8"
CXX_FLAGS
=
"-
O3 -std=c++11 -s -
march=armv8-a -pie -fPIE -w -Wno-error=format-security -llog"
CXX_FLAGS
=
"-march=armv8-a -pie -fPIE -w -Wno-error=format-security -llog"
else
echo
"unknown platform!"
exit
-1
...
...
@@ -98,7 +96,7 @@ build_for_ios() {
BUILD_DIR
=
../build/release/
"
${
PLATFORM
}
"
TOOLCHAIN_FILE
=
"./tools/ios-cmake/ios.toolchain.cmake"
C_FLAGS
=
"-fobjc-abi-version=2 -fobjc-arc -isysroot
${
CMAKE_OSX_SYSROOT
}
"
CXX_FLAGS
=
"-fobjc-abi-version=2 -fobjc-arc -std=gnu++1
1
-stdlib=libc++ -isysroot
${
CMAKE_OSX_SYSROOT
}
"
CXX_FLAGS
=
"-fobjc-abi-version=2 -fobjc-arc -std=gnu++1
4
-stdlib=libc++ -isysroot
${
CMAKE_OSX_SYSROOT
}
"
mkdir
-p
"
${
BUILD_DIR
}
"
if
[
$#
-eq
1
]
;
then
NET
=
$1
...
...
tools/ios-cmake/ios.toolchain.cmake
浏览文件 @
78fd1e66
此差异已折叠。
点击以展开。
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录