Merge remote-tracking branch 'upstream/develop' into develop

397bb881 · qnqinan · 8679842b · e79949e9 · 8679842b · 397bb881
344 changed file
--- a/.gitmodules
+++ b/.gitmodules
-[submodule "src/operators/kernel/mali/ACL_Android"]
-	path = src/operators/kernel/mali/ACL_Android
-	url = https://github.com/halsay/ACL_Android.git
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,7 +9,6 @@ option(WITH_TEST     "build with unit tests" ON)

 # select the platform to build
 option(CPU        "build with arm CPU support" ON)
-option(GPU_MALI   "build with arm mali GPU support" OFF)
 option(GPU_CL     "build with OpenCL support" OFF)
 option(FPGA       "build with FPGA support" OFF)
 if(FPGA)
@@ -23,7 +22,7 @@ file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm)
 file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)
 include_directories(src/)

-set(CMAKE_CXX_FLAGS "-O3 -s -DNDEBUG ${CMAKE_CXX_FLAGS}")
+set(CMAKE_CXX_FLAGS "-O3 -s -DNDEBUG ${CMAKE_CXX_FLAGS} -Wno-attributes")
 if(IS_IOS)
    set(CMAKE_CXX_FLAGS "-mfpu=neon -marm -fobjc-abi-version=2 -fobjc-arc \
        -std=gnu++11 -stdlib=libc++ -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}")
@@ -97,31 +96,6 @@ else()
    endforeach()
 endif()

-if (GPU_MALI)
-    add_definitions(-DPADDLE_MOBILE_MALI_GPU)
-    add_definitions(-DUSE_ACL=1)
-    add_definitions(-DUSE_OPENCL)
-    set(ACL_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/kernel/mali/ACL_Android)
-    include_directories(${ACL_ROOT} ${ACL_ROOT}/include)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -L${ACL_ROOT}/build")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute_core")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -larm_compute_graph")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -L${ACL_ROOT}/build/opencl-1.2-stubs")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lOpenCL")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_ACL=1")
-else()
-    file(GLOB_RECURSE _tmp_list src/operators/kernel/mali/*.cpp src/operators/kernel/mali/*.cc)
-    foreach(f ${_tmp_list})
-        list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
-    endforeach()
-
-    file(GLOB_RECURSE _tmp_list_h src/operators/kernel/mali/*.h)
-    foreach(f ${_tmp_list_h})
-        list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
-    endforeach()
-endif()
-
 if(FPGA)
    add_definitions(-DPADDLE_MOBILE_FPGA)
    file(GLOB_RECURSE _tmp_list src/operators/math/*.cpp src/operators/kernel/fpga/*.cc)
@@ -213,7 +187,7 @@ else()
    set(NET "default" CACHE STRING "select net type")
 endif()

-set_property(CACHE NET PROPERTY STRINGS "default" "googlenet" "mobilenet" "yolo" "squeezenet" "FPGA_NET_V1" "FPGA_NET_V2" "NLP")
+set_property(CACHE NET PROPERTY STRINGS "default" "googlenet" "mobilenet" "yolo" "squeezenet" "FPGA_NET_V1" "FPGA_NET_V2" "NLP" "op")
 include("${CMAKE_CURRENT_LIST_DIR}/tools/op.cmake")

 # build library

--- a/doc/build.md
+++ b/doc/build.md
@@ -46,7 +46,6 @@ root@5affd29d4fc5:/ # ccmake .
 DEBUGING                         ON
 FPGA                             OFF
 LOG_PROFILE                      ON
- MALI_GPU                         OFF
 NET                              googlenet
 USE_EXCEPTION                    ON
 USE_OPENMP                       OFF

--- a/doc/design_doc.md
+++ b/doc/design_doc.md
@@ -109,11 +109,6 @@ USE_OP_CPU(conv2d);
 REGISTER_OPERATOR_CPU(conv2d, ops::ConvOp);
 #endif

-#ifdef PADDLE_MOBILE_MALI_GPU
-USE_OP_MALI_GPU(conv2d);
-REGISTER_OPERATOR_MALI_GPU(conv2d, ops::ConvOp);
-#endif
-
 #ifdef PADDLE_MOBILE_FPGA
 USE_OP_FPGA(conv2d);
 REGISTER_OPERATOR_FPGA(conv2d, ops::ConvOp);

--- a/src/common/log.h
+++ b/src/common/log.h
@@ -31,7 +31,8 @@ namespace paddle_mobile {

 #ifdef ANDROID

-extern const char *ANDROID_LOG_TAG;
+static const char *ANDROID_LOG_TAG =
+    "paddle_mobile LOG built on " __DATE__ " " __TIME__;

 #define ANDROIDLOGI(...)                                               \
  __android_log_print(ANDROID_LOG_INFO, ANDROID_LOG_TAG, __VA_ARGS__); \

--- a/src/common/type_define.h
+++ b/src/common/type_define.h
@@ -37,8 +37,7 @@ template <typename Dtype>
 using OpCreator = std::function<framework::OperatorBase<Dtype> *(
    const std::string & /*type*/, const VariableNameMap & /*inputs*/,
    const VariableNameMap & /*outputs*/,
-    const framework::AttributeMap & /*attrs*/,
-    std::shared_ptr<framework::Scope> /*scope*/)>;
+    const framework::AttributeMap & /*attrs*/, framework::Scope * /*scope*/)>;

 using InferVarTypeFN = std::function<void(const framework::OpDesc & /*op_desc*/,
                                          framework::BlockDesc * /*block*/)>;

--- a/src/common/types.cpp
+++ b/src/common/types.cpp
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -205,6 +205,8 @@ extern const char *G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU;
 extern const char *G_OP_TYPE_FUSION_DECONV_ADD_BN;
 extern const char *G_OP_TYPE_FUSION_DECONV_BN_RELU;

+extern const char *G_OP_TYPE_PAD2D;
+
 extern std::unordered_map<
    std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
    op_input_output_key;

--- a/src/fpga/V1/api.cpp
+++ b/src/fpga/V1/api.cpp
@@ -70,10 +70,11 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor) {
    DLOG << "Wrong ofm dimension";
  }
  auto p = fpga_malloc(memory_size);
-  memset(p, 0, memory_size);
+  // memset(p, 0, memory_size);
  ofm_tensor->reset_data_ptr(p);
  ofm_tensor->set_type(typeid(half));
  ofm_tensor->fpga_data_num = memory_size / sizeof(half);
+  fpga::fpga_flush(p, memory_size);
 }

 void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
@@ -89,10 +90,11 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
    DLOG << "Wrong ofm dimension";
  }
  auto p = fpga_malloc(memory_size);
-  memset(p, 0, memory_size);
+  // memset(p, 0, memory_size);
  ofm_tensor->reset_data_ptr(p);
  ofm_tensor->set_type(typeid(half));
  ofm_tensor->fpga_data_num = memory_size / sizeof(half);
+  fpga::fpga_flush(p, memory_size);
 }

 void format_fp32_ofm(framework::Tensor *ofm_tensor) {
@@ -108,10 +110,11 @@ void format_fp32_ofm(framework::Tensor *ofm_tensor) {
    DLOG << "Wrong ofm dimension";
  }
  auto p = fpga_malloc(memory_size);
-  memset(p, 0, memory_size);
+  // memset(p, 0, memory_size);
  ofm_tensor->reset_data_ptr(p);
  ofm_tensor->set_type(typeid(float));
  ofm_tensor->fpga_data_num = memory_size / sizeof(float);
+  fpga::fpga_flush(p, memory_size);
 }

 float filter_find_max(framework::Tensor *filter_tensor) {
@@ -463,9 +466,24 @@ void expand_EW_arg(EWAddArgs *arg) {
  uint64_t image_amount_per_row =
      align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels,
                 IMAGE_ALIGNMENT);
-  uint64_t image_image_pixel = ((uint64_t)args.image0.channels << 32) |
-                               ((uint64_t)args.image0.width << 16) |
-                               (uint64_t)args.image0.height;
+  //////////////////////////////////////////////////////////
+  // temporary modify for EW and DMA problem
+  uint64_t image_image_pixel = 0;
+  if ((args.image0.width * args.image0.channels) >= 24576) {
+    if ((args.image0.width * args.image0.channels) % 32 != 0) {
+      DLOG << "EW parameter can not be support";
+    } else {
+      image_amount_per_row = image_amount_per_row / 2;
+      image_image_pixel = ((uint64_t)args.image0.channels << 32) |
+                          ((uint64_t)(args.image0.width / 2) << 16) |
+                          (uint64_t)(args.image0.height * 2);
+    }
+  } else {
+    image_image_pixel = ((uint64_t)args.image0.channels << 32) |
+                        ((uint64_t)args.image0.width << 16) |
+                        (uint64_t)args.image0.height;
+  }
+  //////////////////////////////////////////////////////////

  (*arg).driver.image0_address_phy = image0_address_phy;
  (*arg).driver.image1_address_phy = image1_address_phy;
@@ -560,6 +578,18 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
        reinterpret_cast<char *>(arg->conv_arg[i].filter_address), deleter));
    memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size);
    fpga_flush(arg->conv_arg[i].filter_address, filter_size);
+    // for test
+    //    {
+    //    static int cnt = 0;
+    //    if(cnt == 4){
+    //        int8_t result = 0;
+    //        std::string str = "fc_filter";
+    //      fpga::savefile<int8_t>(str, arg->conv_arg[i].filter_address,
+    //      filter_size, result);
+    //
+    //    }
+    //    cnt++;
+    //}

    size_t bs_size = 2 *
                     align_to_x(arg->conv_arg[i].filter_num, BS_NUM_ALIGNMENT) *
@@ -570,6 +600,18 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
        reinterpret_cast<char *>(arg->conv_arg[i].sb_address), deleter));
    memcpy(arg->conv_arg[i].sb_address, bs_head, bs_size);
    fpga_flush(arg->conv_arg[i].sb_address, bs_size);
+    // for test
+    /*{
+    static int cnt = 0;
+    if(cnt == 4){
+        float result = 0;
+        std::string str = "fc_bs";
+      fpga::savefile<float>(str, arg->conv_arg[i].sb_address, bs_size/4,
+result);
+
+    }
+    cnt++;
+}*/

    if (n > 1) {
      arg->conv_arg[i].output.scale_address =

--- a/src/fpga/V1/filter.cpp
+++ b/src/fpga/V1/filter.cpp
@@ -268,6 +268,7 @@ void format_fc_filter(float **data_in, int num, int channel, int height,
  quantize(data_in, data_size, max);
  char **quantize_data = (char **)data_in;  // NOLINT
  convert_fc_filter(quantize_data, num, chw);
+  convert_to_hwc(quantize_data, num, channel, height, width);
  align_element(quantize_data, num, chw);
  if (num_after_alignment != num) {
    align_num(quantize_data, num_per_div_before_alignment, num, chw);

--- a/src/framework/attribute.h
+++ b/src/framework/attribute.h
@@ -91,7 +91,14 @@ class Attribute {
        break;
      }
      case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCK: {
-        attr.Set<int>(attr_desc->block_idx);
+        break;
+      }
+      case PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONGS: {
+        vector<int> val(attr_desc->n_longs);
+        for (int i = 0; i < attr_desc->n_longs; ++i) {
+          val[i] = attr_desc->longs[i];
+        }
+        attr.Set<vector<int>>(val);
        break;
      }
      default:
@@ -139,6 +146,14 @@ class Attribute {
      return vistor(attr.variant_.Get<vector<bool>>());
    } else if (attr.variant_.TypeId() == typeid(int64_t).hash_code()) {
      return vistor(attr.variant_.Get<int64_t>());
+    } else if (attr.variant_.TypeId() ==
+               typeid(framework::BlockDesc *).hash_code()) {
+      return vistor(attr.variant_.Get<framework::BlockDesc *>());
+    } else if (attr.variant_.TypeId() ==
+               typeid(vector<framework::BlockDesc *>).hash_code()) {
+      return vistor(attr.variant_.Get<vector<framework::BlockDesc *>>());
+    } else if (attr.variant_.TypeId() == typeid(vector<int64_t>).hash_code()) {
+      return vistor(attr.variant_.Get<vector<int64_t>>());
    } else {
      PADDLE_MOBILE_THROW_EXCEPTION("type not support");
    }
@@ -146,7 +161,8 @@ class Attribute {

 private:
  Variant<int, float, string, vector<int>, vector<float>, vector<string>, bool,
-          vector<bool>, BlockDesc *, int64_t>
+          vector<bool>, BlockDesc *, vector<BlockDesc *>, int64_t,
+          vector<int64_t>>
      variant_;
 };


--- a/src/framework/cl/cl_engine.cpp
+++ b/src/framework/cl/cl_engine.cpp
@@ -27,9 +27,9 @@ bool CLEngine::Init() {
    return true;
  }
  cl_int status;
-  SetPlatform();
-  SetClDeviceId();
-
+  bool is_setplatform_success = SetPlatform();
+  bool is_setcldeviceid_success = SetClDeviceId();
+  is_init_success_ = is_setplatform_success && is_setcldeviceid_success;
  initialized_ = true;
  return initialized_;
  //  setClCommandQueue();
@@ -44,11 +44,14 @@ CLEngine *CLEngine::Instance() {
  return &cl_engine_;
 }

+bool CLEngine::isInitSuccess() { return is_init_success_; }
 bool CLEngine::SetPlatform() {
  platform_ = NULL;      // the chosen platform
  cl_uint numPlatforms;  // the NO. of platforms
  cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms);
-
+  if (status != CL_SUCCESS) {
+    return false;
+  }
  /**For clarity, choose the first available platform. */
  if (numPlatforms > 0) {
    cl_platform_id *platforms = reinterpret_cast<cl_platform_id *>(
@@ -56,10 +59,10 @@ bool CLEngine::SetPlatform() {
    status = clGetPlatformIDs(numPlatforms, platforms, NULL);
    platform_ = platforms[0];
    free(platforms);
-    return true;
-  } else {
-    return false;
+    return status == CL_SUCCESS;
  }
+
+  return false;
 }

 bool CLEngine::SetClDeviceId() {
@@ -67,13 +70,15 @@ bool CLEngine::SetClDeviceId() {
  devices_ = NULL;
  cl_int status =
      clGetDeviceIDs(platform_, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
-
+  if (status != CL_SUCCESS) {
+    return false;
+  }
  if (numDevices > 0) {
    devices_ = reinterpret_cast<cl_device_id *>(
        malloc(numDevices * sizeof(cl_device_id)));
    status = clGetDeviceIDs(platform_, CL_DEVICE_TYPE_GPU, numDevices, devices_,
                            NULL);
-    return true;
+    return status == CL_SUCCESS;
  }
  return false;
 }

--- a/src/framework/cl/cl_engine.h
+++ b/src/framework/cl/cl_engine.h
@@ -31,7 +31,7 @@ class CLEngine {
  static CLEngine *Instance();

  bool Init();
-
+  bool isInitSuccess();
  std::unique_ptr<_cl_context, CLContextDeleter> CreateContext() {
    cl_int status;
    cl_context c = clCreateContext(NULL, 1, devices_, NULL, NULL, &status);
@@ -51,6 +51,20 @@ class CLEngine {
    return std::move(command_queue_ptr);
  }

+  cl_context getContext() {  // returns context_, creating it on first call
+    if (context_ == nullptr) {
+      context_ = CreateContext();  // stored in the unique_ptr member
+    }
+    return context_.get();  // raw handle; the unique_ptr keeps ownership
+  }
+
+  cl_command_queue getClCommandQueue() {  // returns command_queue_, lazily
+    if (command_queue_ == nullptr) {
+      command_queue_ = CreateClCommandQueue(getContext());  // on getContext()
+    }
+    return command_queue_.get();  // raw handle; the unique_ptr keeps ownership
+  }
+
  std::unique_ptr<_cl_program, CLProgramDeleter> CreateProgramWith(
      cl_context context, std::string file_name) {
    FILE *file = fopen(file_name.c_str(), "rb");
@@ -137,6 +151,11 @@ class CLEngine {
  std::string cl_path_;
  std::unique_ptr<_cl_program, CLProgramDeleter> program_;

+  std::unique_ptr<_cl_context, CLContextDeleter> context_ = nullptr;
+
+  std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> command_queue_ =
+      nullptr;
+
  //  bool SetClContext();

  //  bool SetClCommandQueue();
@@ -144,6 +163,7 @@ class CLEngine {
  //  bool LoadKernelFromFile(const char *kernel_file);

  //  bool BuildProgram();
+  bool is_init_success_ = false;
 };

 }  // namespace framework

--- a/src/framework/cl/cl_scope.h
+++ b/src/framework/cl/cl_scope.h
@@ -29,12 +29,12 @@ namespace framework {
 class CLScope {
 public:
  CLScope() {
-    CLEngine *engin = CLEngine::Instance();
-    context_ = engin->CreateContext();
-    command_queue_ = engin->CreateClCommandQueue(context_.get());
+    CLEngine *engine = CLEngine::Instance();
+    context_ = engine->getContext();
+    command_queue_ = engine->getClCommandQueue();
  }

-  cl_command_queue CommandQueue() { return command_queue_.get(); }
+  cl_command_queue CommandQueue() { return command_queue_; }

  std::unique_ptr<_cl_kernel, CLKernelDeleter> GetKernel(
      const std::string &kernel_name, const std::string &file_name) {
@@ -49,7 +49,7 @@ class CLScope {
    return std::move(kernel);
  }

-  cl_context Context() { return context_.get(); }
+  cl_context Context() { return context_; }

  cl_program Program(const std::string &file_name) {
    auto it = programs_.find(file_name);
@@ -58,7 +58,7 @@ class CLScope {
    }

    auto program = CLEngine::Instance()->CreateProgramWith(
-        context_.get(),
+        context_,
        CLEngine::Instance()->GetCLPath() + "/cl_kernel/" + file_name);

    DLOG << " --- begin build program -> " << file_name << " --- ";
@@ -72,8 +72,8 @@ class CLScope {

 private:
  cl_int status_;
-  std::unique_ptr<_cl_context, CLContextDeleter> context_;
-  std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> command_queue_;
+  cl_context context_;
+  cl_command_queue command_queue_;
  std::unordered_map<std::string,
                     std::unique_ptr<_cl_program, CLProgramDeleter>>
      programs_;

--- a/src/framework/context.h
+++ b/src/framework/context.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#if _OPENMP
+#include <omp.h>
+#endif
+
+#define MOBILE_MAX_CPU_NUM 8
+
+namespace paddle_mobile {
+namespace framework {
+
+// Process-wide record of CPU parameters and of the thread count that
+// set_num_threads() last settled on. The only instance is the one handed out
+// by Context(); the constructor is private.
+struct CPUContext {
+ private:
+  // Fills the first num_cpus (4) frequency slots with a fixed 2400 MHz value
+  // and sets fixed cache sizes. Slots num_cpus..MOBILE_MAX_CPU_NUM-1 of the
+  // frequency arrays are left unset by this constructor.
+  CPUContext() : num_cpus(4), num_threads(1) {
+    // TODO(hjchen2)
+    for (int i = 0; i < num_cpus; ++i) {
+      cpu_frequencies[i] = 2400;      // 2400 MHz
+      max_cpu_frequencies[i] = 2400;  // 2400 MHz
+    }
+    //    L1_cache = 32000;    // 32K
+    L1_cache = 32 * 1024;
+    L2_cache = 2000000;  // 2M
+                         //    L2_cache = 512000;
+  }
+
+ public:
+  // Requests `threads` worker threads.
+  //
+  // With OpenMP enabled the request is forwarded to omp_set_num_threads() and
+  // num_threads is set to the request, capped at omp_get_max_threads().
+  // Without OpenMP the request is ignored. In both cases num_threads ends up
+  // at least 1.
+  void set_num_threads(int threads) {
+    // The guard must be _OPENMP (the macro compilers define when OpenMP is
+    // on, and the one tested around the <omp.h> include); the previous
+    // spelling, _ONENMP, is never defined, so this branch was compiled out.
+#if _OPENMP
+    // omp_set_num_threads() requires a positive argument.
+    if (threads < 1) {
+      threads = 1;
+    }
+    omp_set_num_threads(threads);
+    if (threads <= omp_get_max_threads()) {
+      num_threads = threads;
+    } else {
+      num_threads = omp_get_max_threads();
+    }
+#endif
+    num_threads = (num_threads > 1) ? num_threads : 1;
+  }
+
+  virtual ~CPUContext() {}
+
+ public:
+  // Returns the single shared instance, allocated on first call. Nothing in
+  // this header deletes it.
+  static CPUContext* Context() {
+    static CPUContext* ctx = new CPUContext;
+    return ctx;
+  }
+
+  int num_cpus;     // number of frequency slots filled in by the constructor
+  int num_threads;  // thread count chosen by set_num_threads(); always >= 1
+  int cpu_frequencies[MOBILE_MAX_CPU_NUM];      // MHz, per the values above
+  int max_cpu_frequencies[MOBILE_MAX_CPU_NUM];  // MHz, per the values above
+
+  int L1_cache;  // 32 * 1024; presumably bytes -- TODO confirm
+  int L2_cache;  // 2000000; presumably bytes -- TODO confirm
+};
+
+// Forwards `threads` to set_num_threads() on the shared CPUContext.
+inline void set_global_num_threads(int threads) {
+  CPUContext* shared = CPUContext::Context();
+  shared->set_num_threads(threads);
+}
+
+// Reads the num_threads field of the shared CPUContext.
+inline int get_global_num_threads() {
+  const CPUContext* shared = CPUContext::Context();
+  return shared->num_threads;
+}
+
+}  // namespace framework
+}  // namespace paddle_mobile
--- a/src/framework/data_layout.h
+++ b/src/framework/data_layout.h
@@ -42,6 +42,7 @@ inline DataLayout StringToDataLayout(const std::string &str) {
  } else {
    PADDLE_MOBILE_THROW_EXCEPTION("Unknown storage order string: %s", s.c_str())
  }
+  return DataLayout::kNCHW;
 }

 inline std::string DataLayoutToString(const DataLayout &data_layout) {

--- a/src/framework/dim.h
+++ b/src/framework/dim.h
@@ -82,6 +82,8 @@ struct Dim<0> {
  int64_t &operator[](int idx);

  int64_t operator[](int idx) const;
+
+  int64_t head;
 };

 namespace {
@@ -131,6 +133,7 @@ int64_t &indexer(Dim<D> &dim, int idx) {
 template <>
 int64_t &indexer<0>(Dim<0> &dim, int idx) {
  PADDLE_MOBILE_THROW_EXCEPTION("Invalid index")
+  return dim.head;
 }

 template <int D>
@@ -147,6 +150,7 @@ int64_t indexer(const Dim<D> &dim, int idx) {
 template <>
 int64_t indexer<0>(const Dim<0> &dim, int idx) {
  PADDLE_MOBILE_THROW_EXCEPTION("Invalid index")
+  return dim.head;
 }

 }  // namespace

--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
--- a/src/framework/executor.h
+++ b/src/framework/executor.h
@@ -36,6 +36,8 @@ class Executor {
           paddle_mobile::PaddleMobileConfigInternal config, int batch_size = 1,
           const bool use_optimize = true, const bool lod_mode = false);

+  void SetThreadNum(int threads);
+
  PMStatus Predict(const std::vector<std::pair<std::string, Tensor>> &inputs);
  PMStatus Predict(
      const std::vector<std::pair<std::string, LoDTensor>> &inputs);
@@ -49,16 +51,15 @@ class Executor {

  std::shared_ptr<LoDTensor> GetOutput(const std::string &var_name);

+  void FeedTensorData(const std::vector<framework::Tensor> &v);
+  void GetTensorResults(std::vector<framework::Tensor *> *v);
+
 #ifdef PADDLE_MOBILE_FPGA
  void InjectVariable(const Tensor &t, std::string var_name);
  void FeedData(const Tensor &t);
  void FeedData(const std::vector<void *> &v);
-  void FeedTensorData(const std::vector<framework::Tensor> &v);
-
  void GetResults(std::vector<void *> *v);
-  void GetTensorResults(std::vector<framework::Tensor *> *v);
  framework::Tensor *GetTensorByName(const std::string &name);
-
  std::shared_ptr<Tensor> FetchResult(int id = -1);
  void Predict_From_To(int start = 0, int end = -1);
  void Predict_From(int start);
@@ -68,8 +69,9 @@ class Executor {
 protected:
  Executor() = default;

-  bool varInputMemory(const std::shared_ptr<VarDesc> &var_desc, Variable *var,
-                      LoDTensor *tensor) const;
+  bool varInputMemory(const std::shared_ptr<VarDesc> &var_desc,
+                      Variable *var) const;
+  void InitFeedFetchList();
  void InitMemory();
  void InitCombineMemory();
  void InitNoPersistableMemory(const Tensor &input_tensor);
@@ -85,10 +87,9 @@ class Executor {
  PaddleMobileConfigInternal config_;
  Program<Device> program_;
  std::shared_ptr<ProgramDesc> program_desc_;
-  typedef std::shared_ptr<OperatorBase<Device>> OperatorBasePtr;
-  std::vector<std::vector<OperatorBasePtr>> ops_of_block_;
-  // operators list
-  std::vector<OperatorBasePtr> ops_list_;
+  std::vector<std::shared_ptr<OperatorBase<Device>>> ops_of_block0_;
+  std::unordered_map<std::string, int> feed_indices_;
+  std::unordered_map<std::string, int> fetch_indices_;

  // for super resoltion
  DDim input_dim_last_;

--- a/src/framework/framework.pb-c.c
+++ b/src/framework/framework.pb-c.c
@@ -13,13 +13,6 @@ void paddle_mobile__framework__proto__version__init(
      PADDLE_MOBILE__FRAMEWORK__PROTO__VERSION__INIT;
  *message = init_value;
 }
-size_t paddle_mobile__framework__proto__version__get_packed_size(
-    const PaddleMobile__Framework__Proto__Version *message) {
-  assert(message->base.descriptor ==
-         &paddle_mobile__framework__proto__version__descriptor);
-  return protobuf_c_message_get_packed_size(
-      (const ProtobufCMessage *)(message));
-}
 PaddleMobile__Framework__Proto__Version *
 paddle_mobile__framework__proto__version__unpack(ProtobufCAllocator *allocator,
                                                 size_t len,
@@ -54,13 +47,6 @@ void paddle_mobile__framework__proto__op_desc__init(
      PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__INIT;
  *message = init_value;
 }
-size_t paddle_mobile__framework__proto__op_desc__get_packed_size(
-    const PaddleMobile__Framework__Proto__OpDesc *message) {
-  assert(message->base.descriptor ==
-         &paddle_mobile__framework__proto__op_desc__descriptor);
-  return protobuf_c_message_get_packed_size(
-      (const ProtobufCMessage *)(message));
-}
 PaddleMobile__Framework__Proto__OpDesc *
 paddle_mobile__framework__proto__op_desc__unpack(ProtobufCAllocator *allocator,
                                                 size_t len,
@@ -95,13 +81,6 @@ void paddle_mobile__framework__proto__op_proto__init(
      PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__INIT;
  *message = init_value;
 }
-size_t paddle_mobile__framework__proto__op_proto__get_packed_size(
-    const PaddleMobile__Framework__Proto__OpProto *message) {
-  assert(message->base.descriptor ==
-         &paddle_mobile__framework__proto__op_proto__descriptor);
-  return protobuf_c_message_get_packed_size(
-      (const ProtobufCMessage *)(message));
-}
 PaddleMobile__Framework__Proto__OpProto *
 paddle_mobile__framework__proto__op_proto__unpack(ProtobufCAllocator *allocator,
                                                  size_t len,
@@ -162,13 +141,6 @@ void paddle_mobile__framework__proto__var_type__init(
      PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__INIT;
  *message = init_value;
 }
-size_t paddle_mobile__framework__proto__var_type__get_packed_size(
-    const PaddleMobile__Framework__Proto__VarType *message) {
-  assert(message->base.descriptor ==
-         &paddle_mobile__framework__proto__var_type__descriptor);
-  return protobuf_c_message_get_packed_size(
-      (const ProtobufCMessage *)(message));
-}
 PaddleMobile__Framework__Proto__VarType *
 paddle_mobile__framework__proto__var_type__unpack(ProtobufCAllocator *allocator,
                                                  size_t len,
@@ -191,13 +163,6 @@ void paddle_mobile__framework__proto__var_desc__init(
      PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_DESC__INIT;
  *message = init_value;
 }
-size_t paddle_mobile__framework__proto__var_desc__get_packed_size(
-    const PaddleMobile__Framework__Proto__VarDesc *message) {
-  assert(message->base.descriptor ==
-         &paddle_mobile__framework__proto__var_desc__descriptor);
-  return protobuf_c_message_get_packed_size(
-      (const ProtobufCMessage *)(message));
-}
 PaddleMobile__Framework__Proto__VarDesc *
 paddle_mobile__framework__proto__var_desc__unpack(ProtobufCAllocator *allocator,
                                                  size_t len,
@@ -220,13 +185,6 @@ void paddle_mobile__framework__proto__block_desc__init(
      PADDLE_MOBILE__FRAMEWORK__PROTO__BLOCK_DESC__INIT;
  *message = init_value;
 }
-size_t paddle_mobile__framework__proto__block_desc__get_packed_size(
-    const PaddleMobile__Framework__Proto__BlockDesc *message) {
-  assert(message->base.descriptor ==
-         &paddle_mobile__framework__proto__block_desc__descriptor);
-  return protobuf_c_message_get_packed_size(
-      (const ProtobufCMessage *)(message));
-}
 PaddleMobile__Framework__Proto__BlockDesc *
 paddle_mobile__framework__proto__block_desc__unpack(
    ProtobufCAllocator *allocator, size_t len, const uint8_t *data) {
@@ -248,13 +206,6 @@ void paddle_mobile__framework__proto__program_desc__init(
      PADDLE_MOBILE__FRAMEWORK__PROTO__PROGRAM_DESC__INIT;
  *message = init_value;
 }
-size_t paddle_mobile__framework__proto__program_desc__get_packed_size(
-    const PaddleMobile__Framework__Proto__ProgramDesc *message) {
-  assert(message->base.descriptor ==
-         &paddle_mobile__framework__proto__program_desc__descriptor);
-  return protobuf_c_message_get_packed_size(
-      (const ProtobufCMessage *)(message));
-}
 PaddleMobile__Framework__Proto__ProgramDesc *
 paddle_mobile__framework__proto__program_desc__unpack(
    ProtobufCAllocator *allocator, size_t len, const uint8_t *data) {
@@ -310,7 +261,7 @@ const ProtobufCMessageDescriptor
        NULL /* reserved[123] */
 };
 static const ProtobufCFieldDescriptor
-    paddle_mobile__framework__proto__op_desc__attr__field_descriptors[13] = {
+    paddle_mobile__framework__proto__op_desc__attr__field_descriptors[14] = {
        {
            "name", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING,
            0, /* quantifier_offset */
@@ -405,6 +356,13 @@ static const ProtobufCFieldDescriptor
            NULL, NULL, 0, /* flags */
            0, NULL, NULL  /* reserved1,reserved2, etc */
        },
+        {
+            "longs", 15, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_INT64,
+            offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, n_longs),
+            offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, longs), NULL,
+            NULL, 0,      /* flags */
+            0, NULL, NULL /* reserved1,reserved2, etc */
+        },
 };
 static const unsigned
    paddle_mobile__framework__proto__op_desc__attr__field_indices_by_name[] = {
@@ -417,6 +375,7 @@ static const unsigned
        2,  /* field[2] = i */
        5,  /* field[5] = ints */
        11, /* field[11] = l */
+        13, /* field[13] = longs */
        0,  /* field[0] = name */
        4,  /* field[4] = s */
        7,  /* field[7] = strings */
@@ -424,7 +383,7 @@ static const unsigned
 };
 static const ProtobufCIntRange
    paddle_mobile__framework__proto__op_desc__attr__number_ranges[2 + 1] = {
-        {1, 0}, {10, 8}, {0, 13}};
+        {1, 0}, {10, 8}, {0, 14}};
 const ProtobufCMessageDescriptor
    paddle_mobile__framework__proto__op_desc__attr__descriptor = {
        PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
@@ -433,7 +392,7 @@ const ProtobufCMessageDescriptor
        "PaddleMobile__Framework__Proto__OpDesc__Attr",
        "paddle_mobile.framework.proto",
        sizeof(PaddleMobile__Framework__Proto__OpDesc__Attr),
-        13,
+        14,
        paddle_mobile__framework__proto__op_desc__attr__field_descriptors,
        paddle_mobile__framework__proto__op_desc__attr__field_indices_by_name,
        2,
@@ -1448,7 +1407,7 @@ const ProtobufCMessageDescriptor
        NULL /* reserved[123] */
 };
 static const ProtobufCEnumValue
-    paddle_mobile__framework__proto__attr_type__enum_values_by_number[11] = {
+    paddle_mobile__framework__proto__attr_type__enum_values_by_number[12] = {
        {"INT", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT", 0},
        {"FLOAT", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__FLOAT", 1},
        {"STRING", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRING", 2},
@@ -1460,15 +1419,16 @@ static const ProtobufCEnumValue
        {"BLOCK", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCK", 8},
        {"LONG", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONG", 9},
        {"BLOCKS", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCKS", 10},
+        {"LONGS", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONGS", 11},
 };
 static const ProtobufCIntRange
    paddle_mobile__framework__proto__attr_type__value_ranges[] = {{0, 0},
-                                                                  {0, 11}};
+                                                                  {0, 12}};
 static const ProtobufCEnumValueIndex
-    paddle_mobile__framework__proto__attr_type__enum_values_by_name[11] = {
+    paddle_mobile__framework__proto__attr_type__enum_values_by_name[12] = {
        {"BLOCK", 8}, {"BLOCKS", 10}, {"BOOLEAN", 6}, {"BOOLEANS", 7},
        {"FLOAT", 1}, {"FLOATS", 4},  {"INT", 0},     {"INTS", 3},
-        {"LONG", 9},  {"STRING", 2},  {"STRINGS", 5},
+        {"LONG", 9},  {"LONGS", 11},  {"STRING", 2},  {"STRINGS", 5},
 };
 const ProtobufCEnumDescriptor
    paddle_mobile__framework__proto__attr_type__descriptor = {
@@ -1477,9 +1437,9 @@ const ProtobufCEnumDescriptor
        "AttrType",
        "PaddleMobile__Framework__Proto__AttrType",
        "paddle_mobile.framework.proto",
-        11,
+        12,
        paddle_mobile__framework__proto__attr_type__enum_values_by_number,
-        11,
+        12,
        paddle_mobile__framework__proto__attr_type__enum_values_by_name,
        1,
        paddle_mobile__framework__proto__attr_type__value_ranges,

--- a/src/framework/framework.pb-c.h
+++ b/src/framework/framework.pb-c.h
@@ -102,8 +102,9 @@ typedef enum _PaddleMobile__Framework__Proto__AttrType {
  PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEANS = 7,
  PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCK = 8,
  PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONG = 9,
-  PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCKS =
-      10 PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(
+  PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCKS = 10,
+  PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONGS =
+      11 PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(
          PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE)
 } PaddleMobile__Framework__Proto__AttrType;

@@ -152,13 +153,15 @@ struct _PaddleMobile__Framework__Proto__OpDesc__Attr {
  int64_t l;
  size_t n_blocks_idx;
  int32_t *blocks_idx;
+  size_t n_longs;
+  int64_t *longs;
 };
 #define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__ATTR__INIT                   \
  {                                                                            \
    PROTOBUF_C_MESSAGE_INIT(                                                   \
        &paddle_mobile__framework__proto__op_desc__attr__descriptor)           \
    , NULL, PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT, 0, 0, 0, 0, NULL, \
-        0, NULL, 0, NULL, 0, NULL, 0, 0, 0, NULL, 0, 0, 0, 0, 0, NULL          \
+        0, NULL, 0, NULL, 0, NULL, 0, 0, 0, NULL, 0, 0, 0, 0, 0, NULL, 0, NULL \
  }

 struct _PaddleMobile__Framework__Proto__OpDesc__Var {
@@ -417,8 +420,6 @@ struct _PaddleMobile__Framework__Proto__ProgramDesc {
 /* PaddleMobile__Framework__Proto__Version methods */
 void paddle_mobile__framework__proto__version__init(
    PaddleMobile__Framework__Proto__Version *message);
-size_t paddle_mobile__framework__proto__version__get_packed_size(
-    const PaddleMobile__Framework__Proto__Version *message);
 PaddleMobile__Framework__Proto__Version *
 paddle_mobile__framework__proto__version__unpack(ProtobufCAllocator *allocator,
                                                 size_t len,
@@ -435,8 +436,6 @@ void paddle_mobile__framework__proto__op_desc__var__init(
 /* PaddleMobile__Framework__Proto__OpDesc methods */
 void paddle_mobile__framework__proto__op_desc__init(
    PaddleMobile__Framework__Proto__OpDesc *message);
-size_t paddle_mobile__framework__proto__op_desc__get_packed_size(
-    const PaddleMobile__Framework__Proto__OpDesc *message);
 PaddleMobile__Framework__Proto__OpDesc *
 paddle_mobile__framework__proto__op_desc__unpack(ProtobufCAllocator *allocator,
                                                 size_t len,
@@ -453,8 +452,6 @@ void paddle_mobile__framework__proto__op_proto__attr__init(
 /* PaddleMobile__Framework__Proto__OpProto methods */
 void paddle_mobile__framework__proto__op_proto__init(
    PaddleMobile__Framework__Proto__OpProto *message);
-size_t paddle_mobile__framework__proto__op_proto__get_packed_size(
-    const PaddleMobile__Framework__Proto__OpProto *message);
 PaddleMobile__Framework__Proto__OpProto *
 paddle_mobile__framework__proto__op_proto__unpack(ProtobufCAllocator *allocator,
                                                  size_t len,
@@ -483,8 +480,6 @@ void paddle_mobile__framework__proto__var_type__tuple__init(
 /* PaddleMobile__Framework__Proto__VarType methods */
 void paddle_mobile__framework__proto__var_type__init(
    PaddleMobile__Framework__Proto__VarType *message);
-size_t paddle_mobile__framework__proto__var_type__get_packed_size(
-    const PaddleMobile__Framework__Proto__VarType *message);
 PaddleMobile__Framework__Proto__VarType *
 paddle_mobile__framework__proto__var_type__unpack(ProtobufCAllocator *allocator,
                                                  size_t len,
@@ -495,8 +490,6 @@ void paddle_mobile__framework__proto__var_type__free_unpacked(
 /* PaddleMobile__Framework__Proto__VarDesc methods */
 void paddle_mobile__framework__proto__var_desc__init(
    PaddleMobile__Framework__Proto__VarDesc *message);
-size_t paddle_mobile__framework__proto__var_desc__get_packed_size(
-    const PaddleMobile__Framework__Proto__VarDesc *message);
 PaddleMobile__Framework__Proto__VarDesc *
 paddle_mobile__framework__proto__var_desc__unpack(ProtobufCAllocator *allocator,
                                                  size_t len,
@@ -507,8 +500,6 @@ void paddle_mobile__framework__proto__var_desc__free_unpacked(
 /* PaddleMobile__Framework__Proto__BlockDesc methods */
 void paddle_mobile__framework__proto__block_desc__init(
    PaddleMobile__Framework__Proto__BlockDesc *message);
-size_t paddle_mobile__framework__proto__block_desc__get_packed_size(
-    const PaddleMobile__Framework__Proto__BlockDesc *message);
 PaddleMobile__Framework__Proto__BlockDesc *
 paddle_mobile__framework__proto__block_desc__unpack(
    ProtobufCAllocator *allocator, size_t len, const uint8_t *data);
@@ -518,8 +509,6 @@ void paddle_mobile__framework__proto__block_desc__free_unpacked(
 /* PaddleMobile__Framework__Proto__ProgramDesc methods */
 void paddle_mobile__framework__proto__program_desc__init(
    PaddleMobile__Framework__Proto__ProgramDesc *message);
-size_t paddle_mobile__framework__proto__program_desc__get_packed_size(
-    const PaddleMobile__Framework__Proto__ProgramDesc *message);
 PaddleMobile__Framework__Proto__ProgramDesc *
 paddle_mobile__framework__proto__program_desc__unpack(
    ProtobufCAllocator *allocator, size_t len, const uint8_t *data);

--- a/src/framework/framework.proto
+++ b/src/framework/framework.proto
@@ -35,6 +35,7 @@ enum AttrType {
  BLOCK = 8;
  LONG = 9;
  BLOCKS = 10;
+  LONGS = 11;
 }

 // OpDesc describes an instance of a C++ framework::OperatorBase
@@ -55,6 +56,7 @@ message OpDesc {
    optional int32 block_idx = 12;
    optional int64 l = 13;
    repeated int32 blocks_idx = 14;
+    repeated int64 longs = 15;
  };

  message Var {

--- a/src/framework/load_ops.h
+++ b/src/framework/load_ops.h
@@ -23,13 +23,13 @@ limitations under the License. */
 #define LOAD_CPU_OP(op_type)
 #endif

-#ifdef PADDLE_MOBILE_MALI_GPU
-#define LOAD_MALI_GPU_OP(op_type)                                           \
-  extern int TouchOpRegistrar_##op_type##_##mali_gpu();                     \
-  static int use_op_itself_##op_type##_##mali_gpu __attribute__((unused)) = \
-      TouchOpRegistrar_##op_type##_##mali_gpu()
+#ifdef PADDLE_MOBILE_CL
+#define LOAD_GPU_CL_OP(op_type)                                       \
+  extern int TouchOpRegistrar_##op_type##_##cl();                     \
+  static int use_op_itself_##op_type##_##cl __attribute__((unused)) = \
+      TouchOpRegistrar_##op_type##_##cl()
 #else
-#define LOAD_MALI_GPU_OP(op_type)
+#define LOAD_GPU_CL_OP(op_type)
 #endif

 #ifdef PADDLE_MOBILE_FPGA
@@ -46,9 +46,9 @@ limitations under the License. */
  static int use_fusion_matcher_itself_##op_type __attribute__((unused)) = \
      TouchFusionMatcherRegistrar_##op_type();

-#define LOAD_OP(op_type)     \
-  LOAD_CPU_OP(op_type);      \
-  LOAD_MALI_GPU_OP(op_type); \
+#define LOAD_OP(op_type)   \
+  LOAD_CPU_OP(op_type);    \
+  LOAD_GPU_CL_OP(op_type); \
  LOAD_FPGA_OP(op_type);

 #define LOAD_OP1(op_type, device_type) LOAD_##device_type##_OP(op_type);
@@ -65,76 +65,72 @@ limitations under the License. */
 LOAD_OP(feed)
 LOAD_OP(fetch)
 #ifdef FILL_CONSTANT_OP
-LOAD_OP(fill_constant)
+LOAD_OP2(fill_constant, CPU, FPGA)
 #endif
 #ifdef BATCHNORM_OP
-LOAD_OP2(batch_norm, CPU, MALI_GPU);
+LOAD_OP2(batch_norm, CPU, GPU_CL);
 #endif
 #ifdef BILINEAR_INTERP_OP
 LOAD_OP1(bilinear_interp, CPU);
 #endif
 #ifdef BOXCODER_OP
-LOAD_OP1(box_coder, CPU);
+LOAD_OP2(box_coder, CPU, GPU_CL);
 #endif
 #ifdef CONCAT_OP
-LOAD_OP3(concat, CPU, MALI_GPU, FPGA);
+LOAD_OP3(concat, CPU, GPU_CL, FPGA);
 #endif
 #ifdef CONV_OP
-LOAD_OP3(conv2d, CPU, MALI_GPU, FPGA);
+LOAD_OP3(conv2d, CPU, GPU_CL, FPGA);
 #endif
 #ifdef LRN_OP
-LOAD_OP2(lrn, CPU, MALI_GPU);
+LOAD_OP2(lrn, CPU, GPU_CL);
 #endif
 #ifdef SIGMOID_OP
 LOAD_OP1(sigmoid, CPU);
 #endif
 #ifdef FUSION_FC_RELU_OP
-LOAD_OP3(fusion_fc_relu, CPU, MALI_GPU, FPGA);
+LOAD_OP2(fusion_fc_relu, CPU, FPGA);
 LOAD_FUSION_MATCHER(fusion_fc_relu);
 #endif
 #ifdef FUSION_ELEMENTWISEADDRELU_OP
-LOAD_OP3(fusion_elementwise_add_relu, CPU, MALI_GPU, FPGA);
+LOAD_OP2(fusion_elementwise_add_relu, CPU, FPGA);
 LOAD_FUSION_MATCHER(fusion_elementwise_add_relu);
 #endif
 #ifdef SPLIT_OP
 LOAD_OP1(split, CPU);
 #endif
 #ifdef RESIZE_OP
-LOAD_OP2(resize, CPU, MALI_GPU);
+LOAD_OP1(resize, CPU);
 #endif
 #ifdef FUSION_CONVADDBNRELU_OP
-LOAD_OP2(fusion_conv_add_bn_relu, CPU, FPGA);
+LOAD_OP3(fusion_conv_add_bn_relu, CPU, GPU_CL, FPGA);
 LOAD_FUSION_MATCHER(fusion_conv_add_bn_relu);
 #endif
 #ifdef RESHAPE_OP
-LOAD_OP2(reshape, CPU, MALI_GPU);
+LOAD_OP2(reshape, CPU, GPU_CL);
 #endif
 #ifdef RESHAPE2_OP
-LOAD_OP2(reshape2, CPU, MALI_GPU);
+LOAD_OP1(reshape2, CPU);
 #endif
 #ifdef TRANSPOSE_OP
-LOAD_OP1(transpose, CPU);
+LOAD_OP2(transpose, CPU, GPU_CL);
 #endif
 #ifdef TRANSPOSE2_OP
 LOAD_OP1(transpose2, CPU);
 #endif
 #ifdef PRIORBOX_OP
-LOAD_OP1(prior_box, CPU);
+LOAD_OP2(prior_box, CPU, GPU_CL);
 #endif
 #ifdef FUSION_CONVADDRELU_OP
-LOAD_OP2(fusion_conv_add_relu, CPU, FPGA);
+LOAD_OP3(fusion_conv_add_relu, CPU, GPU_CL, FPGA);
 LOAD_FUSION_MATCHER(fusion_conv_add_relu);
 #endif
-#ifdef FUSION_CONVADDADDPRELU_OP
-LOAD_OP2(fusion_conv_add_add_prelu, CPU, FPGA);
-LOAD_FUSION_MATCHER(fusion_conv_add_add_prelu);
-#endif
 #ifdef FUSION_CONVADD_OP
-LOAD_OP2(fusion_conv_add, CPU, MALI_GPU);
+LOAD_OP2(fusion_conv_add, CPU, GPU_CL);
 LOAD_FUSION_MATCHER(fusion_conv_add);
 #endif
 #ifdef SOFTMAX_OP
-LOAD_OP2(softmax, CPU, MALI_GPU);
+LOAD_OP2(softmax, CPU, GPU_CL);
 #endif
 #ifdef SHAPE_OP
 LOAD_OP1(shape, CPU);
@@ -146,23 +142,23 @@ LOAD_OP1(depthwise_conv2d, CPU);
 LOAD_OP1(conv2d_transpose, CPU);
 #endif
 #ifdef SCALE_OP
-LOAD_OP2(scale, CPU, MALI_GPU);
+LOAD_OP1(scale, CPU);
 #endif
 #ifdef ELEMENTWISEADD_OP
-LOAD_OP2(elementwise_add, CPU, MALI_GPU);
+LOAD_OP2(elementwise_add, CPU, GPU_CL);
 #endif
 #ifdef PRELU_OP
-LOAD_OP2(prelu, CPU, MALI_GPU);
+LOAD_OP1(prelu, CPU);
 #endif
 #ifdef FLATTEN_OP
 LOAD_OP1(flatten, CPU);
 #endif
 #ifdef FUSION_CONVBNADDRELU_OP
-LOAD_OP2(fusion_conv_bn_add_relu, CPU, FPGA);
+LOAD_OP3(fusion_conv_bn_add_relu, CPU, GPU_CL, FPGA);
 LOAD_FUSION_MATCHER(fusion_conv_bn_add_relu);
 #endif
 #ifdef FUSION_CONVBNRELU_OP
-LOAD_OP2(fusion_conv_bn_relu, CPU, FPGA);
+LOAD_OP3(fusion_conv_bn_relu, CPU, GPU_CL, FPGA);
 LOAD_FUSION_MATCHER(fusion_conv_bn_relu);
 #endif
 #ifdef GRU_OP
@@ -176,27 +172,23 @@ LOAD_OP2(fusion_conv_add_bn, CPU, FPGA);
 LOAD_FUSION_MATCHER(fusion_conv_add_bn);
 #endif
 #ifdef DROPOUT_OP
-LOAD_OP2(dropout, CPU, FPGA);
-#endif
-#ifdef FUSION_CONVADDPRELU_OP
-LOAD_OP2(fusion_conv_add_prelu, CPU, FPGA);
-LOAD_FUSION_MATCHER(fusion_conv_add_prelu);
+LOAD_OP3(dropout, CPU, GPU_CL, FPGA);
 #endif
 #ifdef FUSION_DWCONVBNRELU_OP
-LOAD_OP1(fusion_dwconv_bn_relu, CPU);
+LOAD_OP2(fusion_dwconv_bn_relu, CPU, GPU_CL);
 LOAD_FUSION_MATCHER(fusion_dwconv_bn_relu);
 #endif
 #ifdef CRF_OP
 LOAD_OP1(crf_decoding, CPU);
 #endif
 #ifdef MUL_OP
-LOAD_OP2(mul, CPU, MALI_GPU);
+LOAD_OP2(mul, CPU, GPU_CL);
 #endif
 #ifdef NORM_OP
 LOAD_OP1(norm, CPU);
 #endif
 #ifdef RELU_OP
-LOAD_OP2(relu, CPU, MALI_GPU);
+LOAD_OP2(relu, CPU, GPU_CL);
 LOAD_OP1(relu6, CPU);
 #endif
 #ifdef IM2SEQUENCE_OP
@@ -206,14 +198,14 @@ LOAD_OP1(im2sequence, CPU);
 LOAD_OP1(lookup_table, CPU);
 #endif
 #ifdef FUSION_FC_OP
-LOAD_OP3(fusion_fc, CPU, MALI_GPU, FPGA);
+LOAD_OP3(fusion_fc, CPU, GPU_CL, FPGA);
 LOAD_FUSION_MATCHER(fusion_fc);
 #endif
 #ifdef POOL_OP
-LOAD_OP3(pool2d, CPU, MALI_GPU, FPGA);
+LOAD_OP3(pool2d, CPU, GPU_CL, FPGA);
 #endif
 #ifdef MULTICLASSNMS_OP
-LOAD_OP1(multiclass_nms, CPU);
+LOAD_OP2(multiclass_nms, CPU, GPU_CL);
 #endif
 #ifdef POLYGONBOXTRANSFORM_OP
 LOAD_OP1(polygon_box_transform, CPU);
@@ -225,7 +217,7 @@ LOAD_OP1(sum, CPU);
 LOAD_OP1(elementwise_mul, CPU);
 #endif
 #ifdef SLICE_OP
-LOAD_OP2(slice, CPU, MALI_GPU);
+LOAD_OP1(slice, CPU);
 #endif
 #ifdef FUSION_CONVBN_OP
 LOAD_OP2(fusion_conv_bn, CPU, FPGA);
@@ -324,3 +316,15 @@ LOAD_OP1(psroi_pool, CPU);
 #ifdef ROI_PERSPECTIVE_OP
 LOAD_OP1(roi_perspective_transform, CPU);
 #endif
+#ifdef BEAM_SEARCH_OP
+LOAD_OP1(beam_search, CPU);
+#endif
+#ifdef BEAM_SEARCH_DECODE_OP
+LOAD_OP1(beam_search_decode, CPU);
+#endif
+#ifdef PAD2D_OP
+LOAD_OP1(pad2d, CPU);
+#endif
+#ifdef ONE_HOT_OP
+LOAD_OP1(one_hot, CPU);
+#endif
--- a/src/framework/lod_tensor.h
+++ b/src/framework/lod_tensor.h
@@ -221,6 +221,8 @@ inline Print &operator<<(Print &printer, const LoDTensor &tensor) {
      printer << static_cast<int>(tensor.data<int8_t>()[i]) << " ";
    } else if (tensor.type() == typeid(int32_t)) {
      printer << tensor.data<int32_t>()[i] << " ";
+    } else if (tensor.type() == typeid(bool)) {
+      printer << tensor.data<bool>()[i] << " ";
    }
  }
 #endif  // PADDLE_MOBILE_FPGA

--- a/src/framework/op_registry.h
+++ b/src/framework/op_registry.h
@@ -58,8 +58,7 @@ struct OpInfoFiller {
  void operator()(const std::string& op_type, OpInfo<Dtype>* info) const {
    info->creator_ = [](const std::string& type, const VariableNameMap& inputs,
                        const VariableNameMap& outputs,
-                        const AttributeMap& attrs,
-                        std::shared_ptr<Scope> scope) {
+                        const AttributeMap& attrs, framework::Scope* scope) {
      return new T(type, inputs, outputs, attrs, scope);
    };
  }
@@ -91,7 +90,7 @@ class OpRegistry {
  static std::shared_ptr<OperatorBase<Dtype>> CreateOp(
      const std::string& type, const VariableNameMap& inputs,
      const VariableNameMap& outputs, const AttributeMap attrs,
-      std::shared_ptr<paddle_mobile::framework::Scope> scope) {
+      paddle_mobile::framework::Scope* scope) {
    auto& info = OpInfoMap<Dtype>::Instance()->Get(type);
    auto op = info.Creator()(type, inputs, outputs, attrs, scope);
    return std::shared_ptr<OperatorBase<Dtype>>(op);
@@ -116,9 +115,6 @@ class OpRegistry {
 #define REGISTER_OPERATOR_CPU(op_type, op_class) \
  REGISTER_OPERATOR(op_type, op_class, cpu, paddle_mobile::CPU);

-#define REGISTER_OPERATOR_MALI_GPU(op_type, op_class) \
-  REGISTER_OPERATOR(op_type, op_class, mali_gpu, paddle_mobile::GPU_MALI);
-
 #define REGISTER_OPERATOR_FPGA(op_type, op_class) \
  REGISTER_OPERATOR(op_type, op_class, fpga, paddle_mobile::FPGA);


--- a/src/framework/operator.cpp
+++ b/src/framework/operator.cpp
@@ -43,16 +43,13 @@ OperatorBase<Dtype>::OperatorBase(const std::string &type,
                                  const VariableNameMap &inputs,
                                  const VariableNameMap &outputs,
                                  const AttributeMap &attrs,
-                                  std::shared_ptr<Scope> scope)
+                                  framework::Scope *scope)
    : type_(type),
      inputs_(inputs),
      outputs_(outputs),
      attrs_(attrs),
      scope_(scope) {
  CheckAllInputOutputSet();
-#ifdef PADDLE_MOBILE_FPGA
-  InsertTensors();
-#endif
 }

 template <typename Dtype>
@@ -67,30 +64,28 @@ void OperatorBase<Dtype>::Run() {
  for (const auto key : input_keys) {
    auto var_vec_in = inputs_.at(key);
    for (int i = 0; i < var_vec_in.size(); ++i) {
-      auto vari = this->scope_->FindVar(var_vec_in[i]);
-      if (vari->IsInitialized()) {
-        const Tensor *tensor = vari->template Get<framework::LoDTensor>();
-        if (tensor) {
-          DLOG << type_ << " input- " << key << "=" << *tensor;
+      auto var = this->scope_->FindVar(var_vec_in[i]);
+      if (var->IsInitialized() &&
+          var->template IsType<framework::LoDTensor>()) {
+        const Tensor *tensor = var->template Get<framework::LoDTensor>();
+        if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor;
 #ifdef PADDLE_MOBILE_FPGA
-          DLOG << var_vec_in[i];
+        DLOG << var_vec_in[i];
 #endif
-        }
      }
    }
  }
  for (const auto key : GetOutKeys()) {
    auto var_vec_out = outputs_.at(key);
    for (int i = 0; i < var_vec_out.size(); ++i) {
-      auto vari = scope_->FindVar(var_vec_out[i]);
-      if (vari->IsInitialized()) {
-        const Tensor *tensor = vari->template Get<framework::LoDTensor>();
-        if (tensor) {
-          DLOG << type_ << " output- " << key << "=" << *tensor;
+      auto var = scope_->FindVar(var_vec_out[i]);
+      if (var->IsInitialized() &&
+          var->template IsType<framework::LoDTensor>()) {
+        const Tensor *tensor = var->template Get<framework::LoDTensor>();
+        if (tensor) DLOG << type_ << " output- " << key << "=" << *tensor;
 #ifdef PADDLE_MOBILE_FPGA
-          DLOG << var_vec_out[i];
+        DLOG << var_vec_out[i];
 #endif
-        }
      }
    }
  }

--- a/src/framework/operator.h
+++ b/src/framework/operator.h
@@ -15,7 +15,6 @@ limitations under the License. */
 #pragma once

 #include <map>
-#include <memory>
 #include <string>
 #include <utility>
 #include <vector>
@@ -58,7 +57,7 @@ class OperatorBase {
 public:
  OperatorBase(const std::string &type, const VariableNameMap &inputs,
               const VariableNameMap &outputs, const AttributeMap &attrs,
-               std::shared_ptr<Scope> scope);
+               framework::Scope *scope);
  virtual ~OperatorBase() {}

  virtual void Init() = 0;
@@ -81,11 +80,10 @@ class OperatorBase {
  }
 #ifdef PADDLE_MOBILE_FPGA
  void InsertTensors();
-  void ChangeNameMap(string key, std::vector<string> value);
 #endif

 protected:
-  std::shared_ptr<Scope> scope_;
+  framework::Scope *scope_;
  std::string type_;
  VariableNameMap inputs_;
  VariableNameMap outputs_;
@@ -98,35 +96,15 @@ class OperatorBase {
 template <typename Dtype, typename ParamType, typename KernelType>
 class OperatorWithKernel : public OperatorBase<Dtype> {
 public:
-#ifndef PADDLE_MOBILE_FPGA1
  OperatorWithKernel(const std::string &type, const VariableNameMap &inputs,
                     const VariableNameMap &outputs, const AttributeMap &attrs,
-                     std::shared_ptr<Scope> scope)
+                     framework::Scope *scope)
      : OperatorBase<Dtype>(type, inputs, outputs, attrs, scope),
-        param_(inputs, outputs, attrs, scope.get()) {
+        param_(inputs, outputs, attrs, scope) {
 #ifdef PADDLE_MOBILE_CL
    kernel_.InitCLHelper(scope->GetCLScpoe());
 #endif
  }
-#else
-  OperatorWithKernel(const std::string &type, const VariableNameMap inputs,
-                     const VariableNameMap &outputs, const AttributeMap &attrs,
-                     std::shared_ptr<Scope> scope)
-      : OperatorBase<Dtype>(type, inputs, outputs, attrs, scope) {
-    static int feed_num = 0;
-    static int fetch_num = 0;
-    if (type == "feed") {
-      auto new_name = string("feed") + std::to_string(feed_num++);
-      auto var = scope->Var(new_name);
-      (const_cast<VariableNameMap &>(inputs)).at("X") = {string(new_name)};
-    } else if (type == "fetch") {
-      auto new_name = string("fetch") + std::to_string(fetch_num++);
-      auto var = scope->Var(new_name);
-      (const_cast<VariableNameMap &>(outputs)).at("Out") = {string(new_name)};
-    }
-    param_ = ParamType(inputs, outputs, attrs, *scope);
-  }
-#endif
  virtual void RunImpl() { this->kernel_.Compute(this->param_); }

  virtual void InferShape() const = 0;
@@ -152,13 +130,6 @@ class OpKernelBase {
  }
 #endif

-#ifdef PADDLE_McOBILE_MALI_GPU
-  OpKernelBase() { acl_op_ = nullptr; }
-  void *GetAclOp() const { return acl_op_; }
-  void SetAclOp(void *op, void *ob) const {
-    reinterpret_cast<OpKernelBase<Dtype, P> *>(ob)->acl_op_ = op;
-  }
-#endif
  virtual void Compute(const P &para) = 0;
  virtual bool Init(P *para) { return true; }
  virtual ~OpKernelBase() = default;
@@ -169,9 +140,6 @@ class OpKernelBase {
 #endif

 private:
-#ifdef PADDLE_MOBILE_MALI_GPU
-  void *acl_op_;
-#endif
 };

 class FusionOpMatcher {
@@ -198,21 +166,20 @@ class FusionOpMatcher {
  std::shared_ptr<OpDesc> new_opdesc_;
 };

-#define DECLARE_OPERATOR(OpName, OpParam, OpKernel)                          \
-  template <typename DeviceType, typename T>                                 \
-  class OpName##Op : public framework::OperatorWithKernel<                   \
-                         DeviceType, OpParam<DeviceType>,                    \
-                         operators::OpKernel<DeviceType, T>> {               \
-   public:                                                                   \
-    OpName##Op(const std::string &type, const VariableNameMap &inputs,       \
-               const VariableNameMap &outputs,                               \
-               const framework::AttributeMap &attrs,                         \
-               std::shared_ptr<framework::Scope> scope)                      \
-        : framework::OperatorWithKernel<DeviceType, OpParam<DeviceType>,     \
-                                        operators::OpKernel<DeviceType, T>>( \
-              type, inputs, outputs, attrs, scope) {}                        \
-                                                                             \
-    void InferShape() const override;                                        \
+#define DECLARE_OPERATOR(OpName, OpParam, OpKernel)                           \
+  template <typename DeviceType, typename T>                                  \
+  class OpName##Op : public framework::OperatorWithKernel<                    \
+                         DeviceType, OpParam<DeviceType>,                     \
+                         operators::OpKernel<DeviceType, T>> {                \
+   public:                                                                    \
+    OpName##Op(const std::string &type, const VariableNameMap &inputs,        \
+               const VariableNameMap &outputs,                                \
+               const framework::AttributeMap &attrs, framework::Scope *scope) \
+        : framework::OperatorWithKernel<DeviceType, OpParam<DeviceType>,      \
+                                        operators::OpKernel<DeviceType, T>>(  \
+              type, inputs, outputs, attrs, scope) {}                         \
+                                                                              \
+    void InferShape() const override;                                         \
  };

 #define DECLARE_KERNEL(OpName, OpParam)                                   \
@@ -228,7 +195,7 @@ class FusionOpMatcher {
  cls(const std::string &type, const ::paddle_mobile::VariableNameMap &inputs, \
      const ::paddle_mobile::VariableNameMap &outputs,                         \
      const ::paddle_mobile::framework::AttributeMap &attrs,                   \
-      std::shared_ptr<::paddle_mobile::framework::Scope> scope)                \
+      ::paddle_mobile::framework::Scope *scope)                                \
      : parent_cls<Dtype, T>(type, inputs, outputs, attrs, scope) {}

 }  // namespace framework

--- a/src/framework/program/op_desc.cpp
+++ b/src/framework/program/op_desc.cpp
@@ -42,9 +42,15 @@ OpDesc::OpDesc(PaddleMobile__Framework__Proto__OpDesc *desc) {
    PaddleMobile__Framework__Proto__OpDesc__Attr *attr = desc->attrs[k];
    std::string attr_name(attr->name);
    attrs_[attr_name] = Attribute::GetAttrValue(attr);
+    proto_attrs_.push_back(*attr);
  }
 }

+const std::vector<PaddleMobile__Framework__Proto__OpDesc__Attr>
+    &OpDesc::GetProtoAttr() const {  // elements were pushed as struct copies
+  return proto_attrs_;               // NOTE(review): pointer fields look
+}                                    // shallow-copied; confirm proto lifetime
+
 const std::vector<std::string> &OpDesc::Input(const std::string &name) const {
  return inputs_.find(name)->second;
 }
@@ -58,6 +64,15 @@ Attribute OpDesc::GetAttr(const std::string &name) const {
  return it->second;
 }

+void OpDesc::SetBlockAttr(const std::string &name, BlockDesc *block) {
+  this->attrs_[name].Set<BlockDesc *>(block);  // inserts `name` if missing
+}
+
+void OpDesc::SetBlocksAttr(const std::string &name,
+                           std::vector<BlockDesc *> blocks) {  // by value
+  this->attrs_[name].Set<std::vector<BlockDesc *>>(blocks);  // stores list
+}
+
 std::unordered_map<std::string, Attribute> &OpDesc::GetAttrMap() {
  return attrs_;
 }

--- a/src/framework/program/op_desc.h
+++ b/src/framework/program/op_desc.h
@@ -29,11 +29,13 @@ class OpDesc {
  friend class ProgramOptimize;
  friend class FusionOpMatcher;
  friend class Node;
+
  explicit OpDesc(PaddleMobile__Framework__Proto__OpDesc *op_desc);
  OpDesc(const OpDesc &op_desc) : type_(op_desc.type_) {
    this->inputs_ = op_desc.inputs_;
    this->outputs_ = op_desc.outputs_;
    this->attrs_ = op_desc.attrs_;
+    this->proto_attrs_ = op_desc.proto_attrs_;
  }

  OpDesc() {}
@@ -41,6 +43,12 @@ class OpDesc {
  const std::vector<std::string> &Output(const std::string &name) const;
  Attribute GetAttr(const std::string &name) const;

+  const std::vector<PaddleMobile__Framework__Proto__OpDesc__Attr>
+      &GetProtoAttr() const;
+
+  void SetBlockAttr(const std::string &name, BlockDesc *block);
+  void SetBlocksAttr(const std::string &name, std::vector<BlockDesc *> block);
+
  VariableNameMap &GetInputs() { return inputs_; }

  VariableNameMap &GetOutputs() { return outputs_; }
@@ -60,6 +68,7 @@ class OpDesc {
  VariableNameMap inputs_;
  VariableNameMap outputs_;
  AttributeMap attrs_;
+  std::vector<PaddleMobile__Framework__Proto__OpDesc__Attr> proto_attrs_;
 };

 Print &operator<<(Print &printer, const OpDesc &op_desc);

--- a/src/framework/program/program_desc.cpp
+++ b/src/framework/program/program_desc.cpp
@@ -15,8 +15,8 @@ limitations under the License. */
 #include <string>
 #include <vector>

+#include "framework/program/program_desc.h"
 #include "framework/program/tensor_desc.h"
-#include "program_desc.h"

 namespace paddle_mobile {
 namespace framework {
@@ -25,6 +25,25 @@ ProgramDesc::ProgramDesc(PaddleMobile__Framework__Proto__ProgramDesc *desc) {
  for (int i = 0; i < desc->n_blocks; ++i) {
    blocks_.emplace_back(std::make_shared<BlockDesc>(desc->blocks[i]));
  }
+  for (auto &block : blocks_) {
+    for (auto op : block->Ops()) {
+      for (const auto &attr : op->GetProtoAttr()) {
+        if (attr.type == PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCK) {
+          size_t blk_idx = attr.block_idx;
+          op->SetBlockAttr(attr.name, this->MutableBlock(blk_idx));
+        } else if (attr.type ==
+                   PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCKS) {
+          size_t n_blocks_idx = attr.n_blocks_idx;
+          int32_t *blks_idx = attr.blocks_idx;
+          std::vector<BlockDesc *> block_descs;
+          for (size_t i = 0; i < n_blocks_idx; ++i) {
+            block_descs.push_back(this->MutableBlock(blks_idx[i]));
+          }
+          op->SetBlocksAttr(attr.name, block_descs);
+        }
+      }
+    }
+  }
 }

 void ProgramDesc::Description(std::string header) {
@@ -60,9 +79,8 @@ void ProgramDesc::Description(std::string header) {
    }

    for (const auto &var_desc : block->Vars()) {
+      LOG(kLOG_DEBUG1) << "var name: " << var_desc->Name();
      if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
-        LOG(kLOG_DEBUG1) << "var name: " << var_desc->Name();
-
        const TensorDesc &tensor_desc = var_desc->Tensor_desc();

        LOG(kLOG_DEBUG2) << "in var tensor desc dims size: "

--- a/src/framework/program/program_desc.h
+++ b/src/framework/program/program_desc.h
@@ -14,6 +14,7 @@ limitations under the License. */

 #pragma once

+#include <string>
 #include <vector>

 #include "common/types.h"
@@ -31,6 +32,14 @@ class ProgramDesc {

  std::shared_ptr<BlockDesc> Block(size_t idx);

+  // Block at `idx`; nullptr for the -1 sentinel or any out-of-range index.
+  BlockDesc *MutableBlock(size_t idx) {
+    if (idx >= blocks_.size()) {
+      return nullptr;
+    }
+    return blocks_[idx].get();
+  }
+
  const std::vector<std::shared_ptr<BlockDesc>> &Blocks() { return blocks_; }
  ProgramDesc(const ProgramDesc &program_desc) {
    for (auto &block : program_desc.blocks_) {

--- a/src/framework/scope.h
+++ b/src/framework/scope.h
@@ -32,15 +32,7 @@ class Scope {
  Scope() = default;

  ~Scope() {
-    for (auto &var : vars_) {
-      delete var.second;
-    }
-    vars_.clear();
-    for (auto kid : kids_) {
-      delete kid;
-    }
-    kids_.clear();
-
+    DropKids();
 #ifdef PADDLE_MOBILE_CL
    delete cl_scope_;
 #endif

--- a/src/framework/tensor.h
+++ b/src/framework/tensor.h
@@ -209,8 +209,9 @@ class Tensor : public TensorBase {
  }
  inline void set_type(std::type_index type) { holder_->set_type(type); }
  inline void *get_data() {
-    return (void *)(((PlaceholderImpl *)(holder_.get()))->ptr_.get());
-  }  // NOLINT
+    return (
+        void *)(((PlaceholderImpl *)(holder_.get()))->ptr_.get());  // NOLINT
+  }

  inline void *init(std::type_index type) {
    if (holder_ != nullptr) {

--- a/src/framework/tensor_util.h
+++ b/src/framework/tensor_util.h
@@ -14,13 +14,26 @@ limitations under the License. */

 #pragma once
 #include <vector>
+#include "framework/tensor.h"
 #include "memory/t_malloc.h"
-#include "tensor.h"

 namespace paddle_mobile {
 namespace framework {

-void TensorCopy(const Tensor &src, Tensor *dst);
+void TensorCopy(const Tensor& src, Tensor* dst);
+
+// Copies `src` into `dst`, resizing `dst` to the single dimension src.size().
+template <typename T>
+void TensorFromVector(const std::vector<T>& src, Tensor* dst);
+
+template <typename T>
+void TensorFromVector(const std::vector<T>& src, Tensor* dst) {
+  const auto count = src.size();
+  dst->Resize({static_cast<int64_t>(count)});
+  auto out = static_cast<void*>(dst->mutable_data<T>());
+  // Hand memory::Copy the vector's storage and its size, count * sizeof(T).
+  memory::Copy(out, static_cast<const void*>(src.data()), count * sizeof(T));
+}

 }  // namespace framework
 }  // namespace paddle_mobile
--- a/src/io/api_paddle_mobile.cc
+++ b/src/io/api_paddle_mobile.cc
@@ -13,6 +13,7 @@
 // limitations under the License.

 #include "io/api_paddle_mobile.h"
+#include <string>
 #include <vector>
 #include "common/enforce.h"
 #include "framework/tensor.h"
@@ -169,7 +170,7 @@ void PaddleMobilePredictor<Device, T>::GetPaddleTensor(const std::string &name,
                                                       PaddleTensor *output) {
  framework::Tensor *t = paddle_mobile_->GetTensorByName(name);
  ConvertTensors(*t, output);
-};
+}

 template <typename Device, typename T>
 void PaddleMobilePredictor<Device, T>::Predict_From_To(int start, int end) {

--- a/src/io/api_paddle_mobile.h
+++ b/src/io/api_paddle_mobile.h
@@ -14,6 +14,7 @@ limitations under the License. */

 #pragma once

+#include <string>
 #include <vector>
 #include "common/types.h"
 #include "io/paddle_inference_api.h"

--- a/src/io/jni/paddle_mobile_jni.cpp
+++ b/src/io/jni/paddle_mobile_jni.cpp
@@ -39,8 +39,6 @@ using framework::Tensor;
 using paddle_mobile::CPU;
 using std::string;

-const char *ANDROID_LOG_TAG =
-    "paddle_mobile LOG built on " __DATE__ " " __TIME__;
 paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
 static std::mutex shared_mutex;


--- a/src/io/opencl_interface.cpp
+++ b/src/io/opencl_interface.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef PADDLE_MOBILE_CL  // nothing below is compiled without this macro
+
+#include "io/opencl_interface.h"
+#include "framework/cl/cl_engine.h"
+#include "framework/cl/cl_scope.h"
+
+namespace paddle_mobile {
+
+cl_context getContext() {  // forwards to CLEngine::Instance()
+  return framework::CLEngine::Instance()->getContext();
+}
+
+cl_command_queue getClCommandQueue() {  // forwards to CLEngine::Instance()
+  return framework::CLEngine::Instance()->getClCommandQueue();
+}
+
+bool isInitSuccess() {  // forwards to CLEngine::Instance()
+  return framework::CLEngine::Instance()->isInitSuccess();
+}
+
+}  // namespace paddle_mobile
+#endif  // PADDLE_MOBILE_CL
--- a/src/io/opencl_interface.h
+++ b/src/io/opencl_interface.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+// The namespace is opened and closed inside the guard, so a build without
+// PADDLE_MOBILE_CL sees an empty header rather than an unmatched brace.
+#ifdef PADDLE_MOBILE_CL
+#include "CL/cl.h"
+
+namespace paddle_mobile {
+
+// Accessors for OpenCL state; opencl_interface.cpp forwards each of them to
+// the same-named method of framework::CLEngine::Instance().
+cl_context getContext();
+cl_command_queue getClCommandQueue();
+bool isInitSuccess();
+
+}  // namespace paddle_mobile
+
+#endif  // PADDLE_MOBILE_CL
--- a/src/io/paddle_mobile.cpp
+++ b/src/io/paddle_mobile.cpp
@@ -20,6 +20,8 @@ limitations under the License. */
 #endif  // _OPENMP
 #ifdef PADDLE_MOBILE_CL
 #include <CL/cl.h>
+#include <mutex>
+#include "framework/cl/cl_engine.h"
 #include "framework/cl/cl_tensor.h"
 #endif
 #include "operators/math/gemm.h"
@@ -28,9 +30,7 @@ namespace paddle_mobile {

 template <typename Device, typename T>
 void PaddleMobile<Device, T>::SetThreadNum(int num) {
-#ifdef _OPENMP
-  omp_set_num_threads(num);
-#endif
+  executor_->SetThreadNum(num);
 }

 template <typename Device, typename T>
@@ -152,14 +152,14 @@ PMStatus PaddleMobile<Device, T>::Predict() {
 }

 template <typename Device, typename T>
-void PaddleMobile<Device, T>::Feed(const framework::Tensor &input,
-                                   const std::string &var_name) {
+void PaddleMobile<Device, T>::Feed(const std::string &var_name,
+                                   const framework::Tensor &input) {
  executor_->SetInput(input, var_name);
 }

 template <typename Device, typename T>
-void PaddleMobile<Device, T>::Feed(const framework::LoDTensor &input,
-                                   const std::string &var_name) {
+void PaddleMobile<Device, T>::Feed(const std::string &var_name,
+                                   const framework::LoDTensor &input) {
  executor_->SetInput(input, var_name);
 }

@@ -204,11 +204,15 @@ double PaddleMobile<CPU, float>::GetPredictTime() {

  operators::math::Gemm gemm;
  auto time1 = paddle_mobile::time();
-  gemm.Sgemm(m, n, k, static_cast<float>(1), a, lda, b, ldb,
-             static_cast<float>(0), c, ldc, false,
-             static_cast<float *>(nullptr));
+  int times = 4;
+  for (int j = 0; j < times; ++j) {
+    gemm.Sgemm(m, n, k, static_cast<float>(1), a, lda, b, ldb,
+               static_cast<float>(0), c, ldc, false,
+               static_cast<float *>(nullptr));
+  }
+
  auto time2 = paddle_mobile::time();
-  double cost = paddle_mobile::time_diff(time1, time2);
+  double cost = paddle_mobile::time_diff(time1, time2) / times;
  paddle_mobile::memory::Free(a);
  paddle_mobile::memory::Free(b);
  paddle_mobile::memory::Free(c);
@@ -227,16 +231,16 @@ template <typename Device, typename T>
 void PaddleMobile<Device, T>::FeedData(const framework::Tensor &t) {
  executor_->FeedData(t);
 }
+
 template <typename Device, typename T>
 void PaddleMobile<Device, T>::FeedData(const std::vector<void *> &v) {
  executor_->FeedData(v);
-};
-
+}
 template <typename Device, typename T>
 void PaddleMobile<Device, T>::FeedTensorData(
    const std::vector<framework::Tensor> &v) {
  executor_->FeedTensorData(v);
-};
+}

 template <typename Device, typename T>
 void PaddleMobile<Device, T>::GetResults(std::vector<void *> *v) {
@@ -253,7 +257,7 @@ template <typename Device, typename T>
 framework::Tensor *PaddleMobile<Device, T>::GetTensorByName(
    const std::string &name) {
  return executor_->GetTensorByName(name);
-};
+}

 template <typename Device, typename T>
 std::shared_ptr<framework::Tensor> PaddleMobile<Device, T>::FetchResult(
@@ -289,21 +293,11 @@ void PaddleMobile<Device, T>::SetCLPath(std::string path) {
 template <>
 double PaddleMobile<GPU_CL, float>::GetPredictTime() {
  cl_int status;
-  cl_uint nPlatform;
-  clGetPlatformIDs(0, NULL, &nPlatform);
-  cl_platform_id *listPlatform = reinterpret_cast<cl_platform_id *>(
-      malloc(nPlatform * sizeof(cl_platform_id)));
-  clGetPlatformIDs(nPlatform, listPlatform, NULL);
-  cl_uint nDevice = 0;
-  clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_GPU, 0, NULL, &nDevice);
-  cl_device_id *listDevice =
-      reinterpret_cast<cl_device_id *>(malloc(nDevice * sizeof(cl_device_id)));
-  clGetDeviceIDs(listPlatform[0], CL_DEVICE_TYPE_GPU, nDevice, listDevice,
-                 NULL);
-  cl_context context =
-      clCreateContext(NULL, nDevice, listDevice, NULL, NULL, &status);
-  cl_command_queue queue =
-      clCreateCommandQueue(context, listDevice[0], 0, &status);
+  if (!framework::CLEngine::Instance()->isInitSuccess()) {
+    return -1;
+  }
+  cl_context context = framework::CLEngine::Instance()->getContext();
+  cl_command_queue queue = framework::CLEngine::Instance()->getClCommandQueue();

  int n = 1;
  int c = 3;
@@ -417,7 +411,7 @@ double PaddleMobile<GPU_CL, float>::GetPredictTime() {
  CL_CHECK_ERRORS(status);

  clFinish(queue);
-  queue = clCreateCommandQueue(context, listDevice[0], 0, &status);
+  //  queue = clCreateCommandQueue(context, listDevice[0], 0, &status);

  path = framework::CLEngine::Instance()->GetCLPath() +
         "/cl_kernel/conv_kernel.cl";
@@ -472,15 +466,25 @@ double PaddleMobile<GPU_CL, float>::GetPredictTime() {
  //  cl_event wait_event = param.Input()->GetClEvent();
  size_t global_work_size2[3] = {8, 224, 224};
  auto time1 = paddle_mobile::time();
-  status = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size2,
-                                  NULL, 0, NULL, NULL);
+  // Launch the kernel `times` times so the value returned below is the
+  // average cost of one launch rather than a single noisy sample.
+  int times = 10;
+  for (int i = 0; i < times; ++i) {
+    status = clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size2,
+                                    NULL, 0, NULL, NULL);
+    // Stop at the first failed enqueue; otherwise a later successful launch
+    // would overwrite `status` and the failure would go unreported.
+    if (status != CL_SUCCESS) {
+      break;
+    }
+  }
  CL_CHECK_ERRORS(status);
  clFinish(queue);
  auto time2 = paddle_mobile::time();
  paddle_mobile::memory::Free(input);
  paddle_mobile::memory::Free(filter);
  if (status == CL_SUCCESS) {
-    return paddle_mobile::time_diff(time1, time2);
+    return paddle_mobile::time_diff(time1, time2) / times;
  } else {
    return -1;
  }

--- a/src/io/paddle_mobile.h
+++ b/src/io/paddle_mobile.h
@@ -33,7 +33,7 @@ namespace paddle_mobile {
 template <typename Device, typename T = float>
 class PaddleMobile {
 public:
-  PaddleMobile(PaddleMobileConfigInternal config) : config_(config) {
+  explicit PaddleMobile(PaddleMobileConfigInternal config) : config_(config) {
 #ifndef PADDLE_MOBILE_CL
    bool is_gpu = std::is_same<DeviceType<kGPU_CL>, Device>::value;
    PADDLE_MOBILE_ENFORCE(!is_gpu, "Please recompile with GPU_CL is on");
@@ -46,7 +46,7 @@ class PaddleMobile {
    PADDLE_MOBILE_ENFORCE(!is_gpu, "Please recompile with GPU_CL is on");
 #endif
  }
-  ~PaddleMobile() {}
+  virtual ~PaddleMobile() { Clear(); }

  PMStatus Load(const std::string &dirname, const bool optimize = false,
                const bool quantification = false, const int batch_size = 1,
@@ -69,8 +69,8 @@ class PaddleMobile {
                         const std::vector<int64_t> &dims);
  PMStatus Predict();

-  void Feed(const framework::LoDTensor &input, const std::string &var_name);
-  void Feed(const framework::Tensor &input, const std::string &var_name);
+  void Feed(const std::string &var_name, const framework::LoDTensor &input);
+  void Feed(const std::string &var_name, const framework::Tensor &input);

  typedef std::shared_ptr<framework::LoDTensor> LoDTensorPtr;
  LoDTensorPtr Fetch(const std::string &var_name);

--- a/src/operators/activation_op.cpp
+++ b/src/operators/activation_op.cpp
@@ -54,9 +54,6 @@ namespace ops = paddle_mobile::operators;
 REGISTER_OPERATOR_CPU(relu, ops::ReluOp);
 REGISTER_OPERATOR_CPU(relu6, ops::Relu6Op);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(relu, ops::ReluOp);
-#endif
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(relu, ops::ReluOp);
 #endif
@@ -69,6 +66,9 @@ REGISTER_OPERATOR_CL(relu, ops::ReluOp);
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(sigmoid, ops::SigmoidOp);
 #endif
+#ifdef PADDLE_MOBILE_CL
+REGISTER_OPERATOR_CL(sigmoid, ops::SigmoidOp);
+#endif
 #endif  // SIGMOID_OP

 #ifdef TANH_OP

--- a/src/operators/batchnorm_op.cpp
+++ b/src/operators/batchnorm_op.cpp
@@ -34,9 +34,6 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(batch_norm, ops::BatchNormOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(batch_norm, ops::BatchNormOp);
-#endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif


--- a/src/operators/batchnorm_op.h
+++ b/src/operators/batchnorm_op.h
@@ -32,8 +32,7 @@ class BatchNormOp
 public:
  BatchNormOp(const string &type, const VariableNameMap &inputs,
              const VariableNameMap &outputs,
-              const framework::AttributeMap &attrs,
-              std::shared_ptr<framework::Scope> scope)
+              const framework::AttributeMap &attrs, framework::Scope *scope)
      : framework::OperatorWithKernel<DeviceType, BatchNormParam<DeviceType>,
                                      BatchNormKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}

--- a/src/operators/kernel/mali/feed_kernel.cpp
+++ b/src/operators/kernel/mali/feed_kernel.cpp
@@ -12,25 +12,25 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "operators/kernel/feed_kernel.h"
+#ifdef BEAM_SEARCH_DECODE_OP

-namespace paddle_mobile {
-namespace operators {
+#pragma once

-template <>
-bool FeedKernel<GPU_MALI, float>::Init(FeedParam<GPU_MALI> *param) {
-  return true;
-}
+#include "operators/beam_search_decode_op.h"

-template <>
-void FeedKernel<GPU_MALI, float>::Compute(const FeedParam<GPU_MALI> &param) {
-  param.Out()->ShareDataWith(*(param.InputX()));
-  param.Out()->set_lod(param.InputX()->lod());
-}
+namespace paddle_mobile {
+namespace operators {

-template class FeedKernel<GPU_MALI, float>;
+template <typename Dtype, typename T>
+void BeamSearchDecodeOp<Dtype, T>::InferShape() const {}

 }  // namespace operators
 }  // namespace paddle_mobile

+namespace ops = paddle_mobile::operators;
+
+#ifdef PADDLE_MOBILE_CPU
+REGISTER_OPERATOR_CPU(beam_search_decode, ops::BeamSearchDecodeOp);
 #endif
+
+#endif  // BEAM_SEARCH_DECODE_OP
--- a/src/operators/beam_search_decode_op.h
+++ b/src/operators/beam_search_decode_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef BEAM_SEARCH_DECODE_OP
+
+#pragma once
+
+#include <string>
+#include "framework/operator.h"
+#include "operators/kernel/beam_search_decode_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+// Declares the beam-search-decode operator from its parameter and kernel
+// types. DECLARE_OPERATOR is defined outside this file; presumably it
+// expands to the BeamSearchDecodeOp class template whose InferShape() is
+// defined in the matching .cpp -- TODO confirm against the macro definition.
+DECLARE_OPERATOR(BeamSearchDecode, BeamSearchDecodeParam,
+                 BeamSearchDecodeKernel);
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif  // BEAM_SEARCH_DECODE_OP
--- a/src/operators/kernel/arm/conv_add_kernel.cpp
+++ b/src/operators/kernel/arm/conv_add_kernel.cpp
@@ -11,27 +11,26 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#ifdef FUSION_CONVADD_OP

-#include "operators/kernel/conv_add_kernel.h"
-#include "../central-arm-func/conv_add_arm_func.h"
+#ifdef BEAM_SEARCH_OP

-namespace paddle_mobile {
-namespace operators {
+#pragma once

-template <>
-bool ConvAddKernel<CPU, float>::Init(FusionConvAddParam<CPU> *param) {
-  return true;
-}
+#include "operators/beam_search_op.h"

-template <>
-void ConvAddKernel<CPU, float>::Compute(const FusionConvAddParam<CPU> &param) {
-  ConvAddCompute<float>(param);
-}
+namespace paddle_mobile {
+namespace operators {

-template class ConvAddKernel<CPU, float>;
+template <typename Dtype, typename T>
+void BeamSearchOp<Dtype, T>::InferShape() const {}

 }  // namespace operators
 }  // namespace paddle_mobile

+namespace ops = paddle_mobile::operators;
+
+#ifdef PADDLE_MOBILE_CPU
+REGISTER_OPERATOR_CPU(beam_search, ops::BeamSearchOp);
 #endif
+
+#endif  // BEAM_SEARCH_OP
--- a/src/operators/beam_search_op.h
+++ b/src/operators/beam_search_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef BEAM_SEARCH_OP
+
+#pragma once
+
+#include <string>
+#include "framework/operator.h"
+#include "operators/kernel/beam_search_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+// Declares the beam-search operator from its parameter and kernel types.
+// DECLARE_OPERATOR is defined outside this file; presumably it expands to
+// the BeamSearchOp class template whose InferShape() is defined in the
+// matching .cpp -- TODO confirm against the macro definition.
+DECLARE_OPERATOR(BeamSearch, BeamSearchParam, BeamSearchKernel);
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif  // BEAM_SEARCH_OP
--- a/src/operators/bilinear_interp_op.cpp
+++ b/src/operators/bilinear_interp_op.cpp
@@ -48,8 +48,7 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(bilinear_interp, ops::BilinearOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
+
 #ifdef PADDLE_MOBILE_FPGA
 #endif


--- a/src/operators/bilinear_interp_op.h
+++ b/src/operators/bilinear_interp_op.h
@@ -34,8 +34,7 @@ class BilinearOp : public framework::OperatorWithKernel<
 public:
  BilinearOp(const std::string &type, const VariableNameMap &inputs,
             const VariableNameMap &outputs,
-             const framework::AttributeMap &attrs,
-             std::shared_ptr<framework::Scope> scope)
+             const framework::AttributeMap &attrs, framework::Scope *scope)
      : framework::OperatorWithKernel<
            DeviceType, BilinearInterpParam<DeviceType>,
            operators::BilinearInterpKernel<DeviceType, T>>(

--- a/src/operators/box_coder_op.cpp
+++ b/src/operators/box_coder_op.cpp
@@ -58,8 +58,6 @@ REGISTER_OPERATOR_CPU(box_coder, ops::BoxCoderOp);
 #ifdef PADDLE_MOBILE_CL
 REGISTER_OPERATOR_CL(box_coder, ops::BoxCoderOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif


--- a/src/operators/box_coder_op.h
+++ b/src/operators/box_coder_op.h
@@ -34,8 +34,7 @@ class BoxCoderOp : public framework::OperatorWithKernel<
 public:
  BoxCoderOp(const std::string &type, const VariableNameMap &inputs,
             const VariableNameMap &outputs,
-             const framework::AttributeMap &attrs,
-             std::shared_ptr<framework::Scope> scope)
+             const framework::AttributeMap &attrs, framework::Scope *scope)
      : framework::OperatorWithKernel<DeviceType, BoxCoderParam<DeviceType>,
                                      operators::BoxCoderKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}

--- a/src/operators/cast_op.h
+++ b/src/operators/cast_op.h
@@ -31,7 +31,7 @@ class CastOp : public framework::OperatorWithKernel<
 public:
  CastOp(const std::string &type, const VariableNameMap &inputs,
         const VariableNameMap &outputs, const framework::AttributeMap &attrs,
-         std::shared_ptr<framework::Scope> scope)
+         framework::Scope *scope)
      : framework::OperatorWithKernel<DeviceType, CastParam<DeviceType>,
                                      operators::CastKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}

--- a/src/operators/concat_op.cpp
+++ b/src/operators/concat_op.cpp
@@ -69,9 +69,7 @@ REGISTER_OPERATOR_CPU(concat, ops::ConcatOp);
 #ifdef PADDLE_MOBILE_CL
 REGISTER_OPERATOR_CL(concat, ops::ConcatOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(concat, ops::ConcatOp);
-#endif
+
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(concat, ops::ConcatOp);
 #endif

--- a/src/operators/concat_op.h
+++ b/src/operators/concat_op.h
@@ -30,7 +30,7 @@ class ConcatOp : public framework::OperatorWithKernel<
 public:
  ConcatOp(const string &type, const VariableNameMap &inputs,
           const VariableNameMap &outputs, const framework::AttributeMap &attrs,
-           std::shared_ptr<framework::Scope> scope)
+           framework::Scope *scope)
      : framework::OperatorWithKernel<DeviceType, ConcatParam<DeviceType>,
                                      operators::ConcatKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}

--- a/src/operators/conv_op.cpp
+++ b/src/operators/conv_op.cpp
@@ -18,7 +18,7 @@ limitations under the License. */
 #include <vector>
 #include "framework/op_proto_maker.h"
 #include "framework/op_registry.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"

 namespace paddle_mobile {
 namespace operators {
@@ -39,9 +39,9 @@ void ConvOp<Dtype, T>::InferShape() const {

  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
  for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
-                             paddings[i], strides[i]));
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                          dilations[i], paddings[i],
+                                          strides[i]));
  }

  framework::DDim ddim = framework::make_ddim(output_shape);
@@ -55,9 +55,7 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(conv2d, ops::ConvOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(conv2d, ops::ConvOp);
-#endif
+
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(conv2d, ops::ConvOp);
 #endif

--- a/src/operators/conv_op.h
+++ b/src/operators/conv_op.h
@@ -30,7 +30,7 @@ class ConvOp : public framework::OperatorWithKernel<
 public:
  ConvOp(const std::string &type, const VariableNameMap &inputs,
         const VariableNameMap &outputs, const framework::AttributeMap &attrs,
-         std::shared_ptr<framework::Scope> scope)
+         framework::Scope *scope)
      : framework::OperatorWithKernel<DeviceType, ConvParam<DeviceType>,
                                      operators::ConvKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}

--- a/src/operators/conv_transpose_op.cpp
+++ b/src/operators/conv_transpose_op.cpp
@@ -24,8 +24,7 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(conv2d_transpose, ops::ConvOpTranspose);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
+
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(conv2d_transpose, ops::ConvOpTranspose);
 #endif

--- a/src/operators/conv_transpose_op.h
+++ b/src/operators/conv_transpose_op.h
@@ -31,8 +31,7 @@ class ConvOpTranspose : public framework::OperatorWithKernel<
 public:
  ConvOpTranspose(const std::string &type, const VariableNameMap &inputs,
                  const VariableNameMap &outputs,
-                  const framework::AttributeMap &attrs,
-                  std::shared_ptr<framework::Scope> scope)
+                  const framework::AttributeMap &attrs, framework::Scope *scope)
      : framework::OperatorWithKernel<
            DeviceType, ConvTransposeParam<DeviceType>,
            operators::ConvTransposeKernel<DeviceType, T>>(

--- a/src/operators/crf_op.cpp
+++ b/src/operators/crf_op.cpp
@@ -48,8 +48,7 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(crf_decoding, ops::CrfOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
+
 #ifdef PADDLE_MOBILE_FPGA
 #endif


--- a/src/operators/crf_op.h
+++ b/src/operators/crf_op.h
@@ -33,7 +33,7 @@ class CrfOp : public framework::OperatorWithKernel<
 public:
  CrfOp(const std::string &type, const VariableNameMap &inputs,
        const VariableNameMap &outputs, const framework::AttributeMap &attrs,
-        std::shared_ptr<framework::Scope> scope)
+        framework::Scope *scope)
      : framework::OperatorWithKernel<DeviceType, CrfParam<DeviceType>,
                                      operators::CrfKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}

--- a/src/operators/depthwise_conv_op.cpp
+++ b/src/operators/depthwise_conv_op.cpp
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "framework/op_proto_maker.h"
 #include "framework/op_registry.h"
 #include "operators/conv_op.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"

 namespace paddle_mobile {
 namespace operators {
@@ -40,9 +40,9 @@ void DepthwiseConvOp<Dtype, T>::InferShape() const {

  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
  for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
-                             paddings[i], strides[i]));
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                          dilations[i], paddings[i],
+                                          strides[i]));
  }

  framework::DDim ddim = framework::make_ddim(output_shape);

--- a/src/operators/depthwise_conv_op.h
+++ b/src/operators/depthwise_conv_op.h
@@ -30,8 +30,7 @@ class DepthwiseConvOp : public framework::OperatorWithKernel<
 public:
  DepthwiseConvOp(const std::string &type, const VariableNameMap &inputs,
                  const VariableNameMap &outputs,
-                  const framework::AttributeMap &attrs,
-                  std::shared_ptr<framework::Scope> scope)
+                  const framework::AttributeMap &attrs, framework::Scope *scope)
      : framework::OperatorWithKernel<DeviceType, ConvParam<DeviceType>,
                                      operators::ConvKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}

--- a/src/operators/dequantize_op.h
+++ b/src/operators/dequantize_op.h
@@ -32,8 +32,7 @@ class DequantizeOp
 public:
  DequantizeOp(const std::string &type, const VariableNameMap &inputs,
               const VariableNameMap &outputs,
-               const framework::AttributeMap &attrs,
-               std::shared_ptr<framework::Scope> scope)
+               const framework::AttributeMap &attrs, framework::Scope *scope)
      : framework::OperatorWithKernel<DeviceType, DequantizeParam<DeviceType>,
                                      DequantizeKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}

--- a/src/operators/dropout_op.h
+++ b/src/operators/dropout_op.h
@@ -34,7 +34,7 @@ class DropoutOp : public framework::OperatorWithKernel<
 public:
  DropoutOp(const std::string &type, const VariableNameMap &inputs,
            const VariableNameMap &outputs, const framework::AttributeMap attrs,
-            std::shared_ptr<framework::Scope> scope)
+            framework::Scope *scope)
      : framework::OperatorWithKernel<DeviceType, DropoutParam<DeviceType>,
                                      operators::DropoutKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}

--- a/src/operators/elementwise_add_op.cpp
+++ b/src/operators/elementwise_add_op.cpp
@@ -32,9 +32,6 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(elementwise_add, ops::ElementwiseAddOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(elementwise_add, ops::ElementwiseAddOp);
-#endif

 #ifdef PADDLE_MOBILE_CL
 REGISTER_OPERATOR_CL(elementwise_add, ops::ElementwiseAddOp);

--- a/src/operators/elementwise_add_op.h
+++ b/src/operators/elementwise_add_op.h
@@ -32,7 +32,7 @@ class ElementwiseAddOp : public framework::OperatorWithKernel<
  ElementwiseAddOp(const string &type, const VariableNameMap &inputs,
                   const VariableNameMap &outputs,
                   const framework::AttributeMap &attrs,
-                   std::shared_ptr<framework::Scope> scope)
+                   framework::Scope *scope)
      : framework::OperatorWithKernel<
            DeviceType, ElementwiseAddParam<DeviceType>,
            operators::ElementwiseAddKernel<DeviceType, T>>(

--- a/src/operators/elementwise_mul_op.cpp
+++ b/src/operators/elementwise_mul_op.cpp
@@ -32,9 +32,6 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(elementwise_mul, ops::ElementwiseMulOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(elementwise_mul, ops::ElementwiseMulOp);
-#endif
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(elementwise_mul, ops::ElementwiseMulOp);
 #endif

--- a/src/operators/elementwise_mul_op.h
+++ b/src/operators/elementwise_mul_op.h
@@ -32,7 +32,7 @@ class ElementwiseMulOp : public framework::OperatorWithKernel<
  ElementwiseMulOp(const string &type, const VariableNameMap &inputs,
                   const VariableNameMap &outputs,
                   const framework::AttributeMap &attrs,
-                   std::shared_ptr<framework::Scope> scope)
+                   framework::Scope *scope)
      : framework::OperatorWithKernel<
            DeviceType, ElementwiseMulParam<DeviceType>,
            operators::ElementwiseMulKernel<DeviceType, T>>(

--- a/src/operators/elementwise_sub_op.cpp
+++ b/src/operators/elementwise_sub_op.cpp
@@ -32,9 +32,6 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(elementwise_sub, ops::ElementwiseSubOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(elementwise_sub, ops::ElementwiseSubOp);
-#endif
 #ifdef PADDLE_MOBILE_FPGA
 #endif


--- a/src/operators/elementwise_sub_op.h
+++ b/src/operators/elementwise_sub_op.h
@@ -32,7 +32,7 @@ class ElementwiseSubOp : public framework::OperatorWithKernel<
  ElementwiseSubOp(const string &type, const VariableNameMap &inputs,
                   const VariableNameMap &outputs,
                   const framework::AttributeMap &attrs,
-                   std::shared_ptr<framework::Scope> scope)
+                   framework::Scope *scope)
      : framework::OperatorWithKernel<
            DeviceType, ElementwiseSubParam<DeviceType>,
            operators::ElementwiseSubKernel<DeviceType, T>>(

--- a/src/operators/feed_op.cpp
+++ b/src/operators/feed_op.cpp
@@ -21,7 +21,8 @@ template <typename DeviceType, typename T>
 void FeedOp<DeviceType, T>::InferShape() const {
  auto out_dims = this->param_.Out()->dims();
  out_dims[0] = this->param_.BatchSize();
-  auto input_dims = this->param_.InputX()->dims();
+  int col = this->param_.Col();
+  auto input_dims = this->param_.InputX()->at(col).dims();
  if (input_dims.size() == 4) {
    this->param_.Out()->Resize(input_dims);
  } else {
@@ -37,9 +38,6 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(feed, ops::FeedOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(feed, ops::FeedOp);
-#endif
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(feed, ops::FeedOp);
 #endif

--- a/src/operators/feed_op.h
+++ b/src/operators/feed_op.h
@@ -31,7 +31,7 @@ class FeedOp
 public:
  FeedOp(const std::string &type, const VariableNameMap &inputs,
         const VariableNameMap &outputs, const framework::AttributeMap attrs,
-         std::shared_ptr<framework::Scope> scope)
+         framework::Scope *scope)

      : framework::OperatorWithKernel<DeviceType, FeedParam<DeviceType>,
                                      FeedKernel<DeviceType, T>>(

--- a/src/operators/fetch_op.cpp
+++ b/src/operators/fetch_op.cpp
@@ -18,8 +18,9 @@ namespace operators {

 template <typename DeviceType, typename T>
 void FetchOp<DeviceType, T>::InferShape() const {
+  int col = this->param_.Col();
  auto x_dims = this->param_.InputX()->dims();
-  this->param_.Out()->Resize(x_dims);
+  this->param_.Out()->at(col).Resize(x_dims);
 }

 }  // namespace operators
@@ -29,9 +30,7 @@ namespace ops = paddle_mobile::operators;
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(fetch, ops::FetchOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(fetch, ops::FetchOp);
-#endif
+
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(fetch, ops::FetchOp);
 #endif

--- a/src/operators/fetch_op.h
+++ b/src/operators/fetch_op.h
@@ -30,7 +30,7 @@ class FetchOp
 public:
  FetchOp(const string &type, const VariableNameMap &inputs,
          const VariableNameMap &outputs, const framework::AttributeMap attrs,
-          std::shared_ptr<framework::Scope> scope)
+          framework::Scope *scope)
      : framework::OperatorWithKernel<DeviceType, FetchParam<DeviceType>,
                                      FetchKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}

--- a/src/operators/fill_constant_op.h
+++ b/src/operators/fill_constant_op.h
@@ -31,11 +31,10 @@ class FillConstantOp : public framework::OperatorBase<DeviceType> {
 public:
  FillConstantOp(const std::string &type, const VariableNameMap &inputs,
                 const VariableNameMap &outputs,
-                 const framework::AttributeMap attrs,
-                 std::shared_ptr<framework::Scope> scope)
+                 const framework::AttributeMap attrs, framework::Scope *scope)
      : framework::OperatorBase<DeviceType>(type, inputs, outputs, attrs,
                                            scope),
-        param_(inputs, outputs, attrs, scope.get()) {}
+        param_(inputs, outputs, attrs, scope) {}
  void RunImpl() {
    auto data_type =
        static_cast<_PaddleMobile__Framework__Proto__VarType__Type>(

--- a/src/operators/flatten_op.h
+++ b/src/operators/flatten_op.h
@@ -49,8 +49,7 @@ class FlattenOp : public framework::OperatorWithKernel<
 public:
  FlattenOp(const std::string &type, const VariableNameMap &inputs,
            const VariableNameMap &outputs,
-            const framework::AttributeMap &attrs,
-            std::shared_ptr<framework::Scope> scope)
+            const framework::AttributeMap &attrs, framework::Scope *scope)
      : framework::OperatorWithKernel<DeviceType, FlattenParam<DeviceType>,
                                      operators::FlattenKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}

--- a/src/operators/fusion_conv_add_add_prelu_op.cpp
+++ b/src/operators/fusion_conv_add_add_prelu_op.cpp
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef FUSION_CONVADDADDPRELU_OP
-
-#include "operators/fusion_conv_add_add_prelu_op.h"
-#include "operators/math/conv_func.h"
-
-namespace paddle_mobile {
-namespace operators {
-
-template <typename Dtype, typename T>
-void FusionConvAddAddPReluOp<Dtype, T>::InferShape() const {
-  auto in_dims = this->param_.Input()->dims();
-  auto filter_dims = this->param_.Filter()->dims();
-  const std::vector<int> &strides = this->param_.Strides();
-  std::vector<int> paddings = this->param_.Paddings();
-  int groups = this->param_.Groups();
-  std::vector<int> dilations = this->param_.Dilations();
-
-  PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
-                         dilations.size() == paddings.size() &&
-                         paddings.size() == strides.size()),
-                        "ConvParam is not suitable");
-
-  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
-  for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
-                             paddings[i], strides[i]));
-  }
-  framework::DDim ddim = framework::make_ddim(output_shape);
-  this->param_.Output()->Resize(ddim);
-}
-
-}  // namespace operators
-}  // namespace paddle_mobile
-
-namespace ops = paddle_mobile::operators;
-REGISTER_FUSION_MATCHER(fusion_conv_add_add_prelu,
-                        ops::FusionConvAddAddPReluOpMatcher);
-
-#ifdef PADDLE_MOBILE_CPU
-REGISTER_OPERATOR_CPU(fusion_conv_add_add_prelu, ops::FusionConvAddAddPReluOp);
-#endif
-#ifdef PADDLE_MOBILE_FPGA
-REGISTER_OPERATOR_FPGA(fusion_conv_add_add_prelu, ops::FusionConvAddAddPReluOp);
-#endif
-
-#endif  // FUSION_CONVADDADDPRELU_OP
--- a/src/operators/fusion_conv_add_add_prelu_op.h
+++ b/src/operators/fusion_conv_add_add_prelu_op.h
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef FUSION_CONVADDADDPRELU_OP
-
-#pragma once
-
-#include <string>
-#include <utility>
-#include <vector>
-#include "framework/operator.h"
-#include "framework/program/program-optimize/fusion_op_register.h"
-#include "operators/kernel/conv_add_add_prelu_kernel.h"
-#include "operators/op_param.h"
-
-namespace paddle_mobile {
-namespace operators {
-
-class FusionConvAddAddPReluOpMatcher : public framework::FusionOpMatcher {
- public:
-  FusionConvAddAddPReluOpMatcher() {
-    node_ = framework::Node(G_OP_TYPE_CONV);
-    node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
-        std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
-        std::make_shared<framework::Node>(G_OP_TYPE_PRELU);
-  }
-
-  void FolderNodes(
-      framework::Node *node,
-      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
-    node->Folder(node_.Depth(), Type(),
-                 {{G_OP_TYPE_ELEMENTWISE_ADD,
-                   {{"Y", "Y"}, {"Out", "addOut"}, {"X", "addX"}}},
-                  {G_OP_TYPE_PRELU, {{"Alpha", "Alpha"}}}},
-
-                 removed_nodes);
-  }
-  std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU; }
-
-  std::vector<std::pair<int, std::string>> NeedCheck() {
-    DLOG << " conv add add prelu check add X ";
-    return {{2, "Y"}, {2, "X"}};
-  }
-};
-
-template <typename DeviceType, typename T>
-class FusionConvAddAddPReluOp
-    : public framework::OperatorWithKernel<
-          DeviceType, FusionConvAddAddPReluParam<DeviceType>,
-          operators::ConvAddAddPReluKernel<DeviceType, T>> {
- public:
-  FusionConvAddAddPReluOp(const string &type, const VariableNameMap &inputs,
-                          const VariableNameMap &outputs,
-                          const framework::AttributeMap &attrs,
-                          std::shared_ptr<framework::Scope> scope)
-      : framework::OperatorWithKernel<
-            DeviceType, FusionConvAddAddPReluParam<DeviceType>,
-            operators::ConvAddAddPReluKernel<DeviceType, T>>(
-            type, inputs, outputs, attrs, scope) {}
-  void InferShape() const override;
-
- protected:
-};
-
-}  // namespace operators
-}  // namespace paddle_mobile
-
-#endif
--- a/src/operators/fusion_conv_add_bn_op.cpp
+++ b/src/operators/fusion_conv_add_bn_op.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVADDBN_OP

 #include "operators/fusion_conv_add_bn_op.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"

 namespace paddle_mobile {
 namespace operators {
@@ -36,9 +36,9 @@ void FusionConvAddBNOp<Dtype, T>::InferShape() const {

  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
  for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
-                             paddings[i], strides[i]));
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                          dilations[i], paddings[i],
+                                          strides[i]));
  }

  framework::DDim ddim = framework::make_ddim(output_shape);

--- a/src/operators/fusion_conv_add_bn_op.h
+++ b/src/operators/fusion_conv_add_bn_op.h
@@ -20,8 +20,8 @@ limitations under the License. */
 #include <vector>
 #include "framework/operator.h"
 #include "framework/program/program-optimize/fusion_op_register.h"
-#include "op_param.h"
 #include "operators/kernel/conv_add_bn_kernel.h"
+#include "operators/op_param.h"

 namespace paddle_mobile {
 namespace operators {
@@ -59,7 +59,7 @@ class FusionConvAddBNOp : public framework::OperatorWithKernel<
  FusionConvAddBNOp(const string &type, const VariableNameMap &inputs,
                    const VariableNameMap &outputs,
                    const framework::AttributeMap &attrs,
-                    std::shared_ptr<framework::Scope> scope)
+                    framework::Scope *scope)
      : framework::OperatorWithKernel<
            DeviceType, FusionConvAddBNParam<DeviceType>,
            operators::ConvAddBNKernel<DeviceType, T>>(type, inputs, outputs,

--- a/src/operators/fusion_conv_add_bn_relu_op.cpp
+++ b/src/operators/fusion_conv_add_bn_relu_op.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVADDBNRELU_OP

 #include "operators/fusion_conv_add_bn_relu_op.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"

 namespace paddle_mobile {
 namespace operators {
@@ -36,9 +36,9 @@ void FusionConvAddBNReluOp<Dtype, T>::InferShape() const {

  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
  for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
-                             paddings[i], strides[i]));
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                          dilations[i], paddings[i],
+                                          strides[i]));
  }

  framework::DDim ddim = framework::make_ddim(output_shape);

--- a/src/operators/fusion_conv_add_bn_relu_op.h
+++ b/src/operators/fusion_conv_add_bn_relu_op.h
@@ -61,7 +61,7 @@ class FusionConvAddBNReluOp
  FusionConvAddBNReluOp(const string &type, const VariableNameMap &inputs,
                        const VariableNameMap &outputs,
                        const framework::AttributeMap &attrs,
-                        std::shared_ptr<framework::Scope> scope)
+                        framework::Scope *scope)
      : framework::OperatorWithKernel<
            DeviceType, FusionConvAddBNReluParam<DeviceType>,
            operators::ConvAddBNReluKernel<DeviceType, T>>(

--- a/src/operators/fusion_conv_add_op.cpp
+++ b/src/operators/fusion_conv_add_op.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVADD_OP

 #include "operators/fusion_conv_add_op.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"

 namespace paddle_mobile {
 namespace operators {
@@ -36,9 +36,9 @@ void FusionConvAddOp<Dtype, T>::InferShape() const {

  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
  for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
-                             paddings[i], strides[i]));
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                          dilations[i], paddings[i],
+                                          strides[i]));
  }

  framework::DDim ddim = framework::make_ddim(output_shape);
@@ -54,9 +54,6 @@ REGISTER_FUSION_MATCHER(fusion_conv_add, ops::FusionConvAddMatcher);
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(fusion_conv_add, ops::FusionConvAddOp);
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-REGISTER_OPERATOR_MALI_GPU(fusion_conv_add, ops::FusionConvAddOp);
-#endif

 #ifdef PADDLE_MOBILE_CL
 REGISTER_OPERATOR_CL(fusion_conv_add, ops::FusionConvAddOp);

--- a/src/operators/fusion_conv_add_op.h
+++ b/src/operators/fusion_conv_add_op.h
@@ -50,8 +50,7 @@ class FusionConvAddOp : public framework::OperatorWithKernel<
 public:
  FusionConvAddOp(const string &type, const VariableNameMap &inputs,
                  const VariableNameMap &outputs,
-                  const framework::AttributeMap &attrs,
-                  std::shared_ptr<framework::Scope> scope)
+                  const framework::AttributeMap &attrs, framework::Scope *scope)
      : framework::OperatorWithKernel<DeviceType,
                                      FusionConvAddParam<DeviceType>,
                                      operators::ConvAddKernel<DeviceType, T>>(

--- a/src/operators/fusion_conv_add_prelu_op.cpp
+++ b/src/operators/fusion_conv_add_prelu_op.cpp
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef FUSION_CONVADDPRELU_OP
-
-#include "operators/fusion_conv_add_prelu_op.h"
-#include "operators/math/conv_func.h"
-
-namespace paddle_mobile {
-namespace operators {
-
-template <typename Dtype, typename T>
-void FusionConvAddPReluOp<Dtype, T>::InferShape() const {
-  auto in_dims = this->param_.Input()->dims();
-  auto filter_dims = this->param_.Filter()->dims();
-  const std::vector<int> &strides = this->param_.Strides();
-  std::vector<int> paddings = this->param_.Paddings();
-  int groups = this->param_.Groups();
-  std::vector<int> dilations = this->param_.Dilations();
-
-  PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
-                         dilations.size() == paddings.size() &&
-                         paddings.size() == strides.size()),
-                        "ConvParam is not suitable");
-
-  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
-  for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
-                             paddings[i], strides[i]));
-  }
-  framework::DDim ddim = framework::make_ddim(output_shape);
-  this->param_.Output()->Resize(ddim);
-}
-
-}  // namespace operators
-}  // namespace paddle_mobile
-
-namespace ops = paddle_mobile::operators;
-REGISTER_FUSION_MATCHER(fusion_conv_add_prelu,
-                        ops::FusionConvAddPReluOpMatcher);
-
-#ifdef PADDLE_MOBILE_CPU
-REGISTER_OPERATOR_CPU(fusion_conv_add_prelu, ops::FusionConvAddPReluOp);
-#endif
-#ifdef PADDLE_MOBILE_FPGA
-REGISTER_OPERATOR_FPGA(fusion_conv_add_prelu, ops::FusionConvAddPReluOp);
-#endif
-
-#endif
--- a/src/operators/fusion_conv_add_relu_op.cpp
+++ b/src/operators/fusion_conv_add_relu_op.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVADDRELU_OP

 #include "operators/fusion_conv_add_relu_op.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"

 namespace paddle_mobile {
 namespace operators {
@@ -36,9 +36,9 @@ void FusionConvAddReluOp<Dtype, T>::InferShape() const {

  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
  for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
-                             paddings[i], strides[i]));
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                          dilations[i], paddings[i],
+                                          strides[i]));
  }
  framework::DDim ddim = framework::make_ddim(output_shape);
  this->param_.Output()->Resize(ddim);

--- a/src/operators/fusion_conv_add_relu_op.h
+++ b/src/operators/fusion_conv_add_relu_op.h
@@ -51,7 +51,7 @@ class FusionConvAddReluOp : public framework::OperatorWithKernel<
  FusionConvAddReluOp(const string &type, const VariableNameMap &inputs,
                      const VariableNameMap &outputs,
                      const framework::AttributeMap &attrs,
-                      std::shared_ptr<framework::Scope> scope)
+                      framework::Scope *scope)
      : framework::OperatorWithKernel<
            DeviceType, FusionConvAddReluParam<DeviceType>,
            operators::ConvAddReluKernel<DeviceType, T>>(type, inputs, outputs,

--- a/src/operators/fusion_conv_bn_add_relu_op.cpp
+++ b/src/operators/fusion_conv_bn_add_relu_op.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVBNADDRELU_OP

 #include "operators/fusion_conv_bn_add_relu_op.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"

 namespace paddle_mobile {
 namespace operators {
@@ -36,9 +36,9 @@ void FusionConvBNAddReluOp<Dtype, T>::InferShape() const {

  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
  for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
-                             paddings[i], strides[i]));
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                          dilations[i], paddings[i],
+                                          strides[i]));
  }

  framework::DDim ddim = framework::make_ddim(output_shape);

--- a/src/operators/fusion_conv_bn_add_relu_op.h
+++ b/src/operators/fusion_conv_bn_add_relu_op.h
@@ -67,7 +67,7 @@ class FusionConvBNAddReluOp
  FusionConvBNAddReluOp(const string &type, const VariableNameMap &inputs,
                        const VariableNameMap &outputs,
                        const framework::AttributeMap &attrs,
-                        std::shared_ptr<framework::Scope> scope)
+                        framework::Scope *scope)
      : framework::OperatorWithKernel<
            DeviceType, FusionConvBNAddReluParam<DeviceType>,
            operators::ConvBNAddReluKernel<DeviceType, T>>(

--- a/src/operators/fusion_conv_bn_op.cpp
+++ b/src/operators/fusion_conv_bn_op.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVBN_OP

 #include "operators/fusion_conv_bn_op.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"

 namespace paddle_mobile {
 namespace operators {
@@ -35,9 +36,9 @@ void FusionConvBNOp<Dtype, T>::InferShape() const {

  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
  for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
-                             paddings[i], strides[i]));
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                          dilations[i], paddings[i],
+                                          strides[i]));
  }

  framework::DDim ddim = framework::make_ddim(output_shape);

--- a/src/operators/fusion_conv_bn_op.h
+++ b/src/operators/fusion_conv_bn_op.h
@@ -56,8 +56,7 @@ class FusionConvBNOp : public framework::OperatorWithKernel<
 public:
  FusionConvBNOp(const string &type, const VariableNameMap &inputs,
                 const VariableNameMap &outputs,
-                 const framework::AttributeMap &attrs,
-                 std::shared_ptr<framework::Scope> scope)
+                 const framework::AttributeMap &attrs, framework::Scope *scope)
      : framework::OperatorWithKernel<DeviceType, FusionConvBNParam<DeviceType>,
                                      operators::ConvBNKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}

--- a/src/operators/fusion_conv_bn_relu_op.cpp
+++ b/src/operators/fusion_conv_bn_relu_op.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVBNRELU_OP

 #include "operators/fusion_conv_bn_relu_op.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"

 namespace paddle_mobile {
 namespace operators {
@@ -36,9 +36,9 @@ void FusionConvBNReluOp<Dtype, T>::InferShape() const {

  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
  for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(
-        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
-                             paddings[i], strides[i]));
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                          dilations[i], paddings[i],
+                                          strides[i]));
  }

  framework::DDim ddim = framework::make_ddim(output_shape);

--- a/src/operators/fusion_conv_bn_relu_op.h
+++ b/src/operators/fusion_conv_bn_relu_op.h
@@ -58,7 +58,7 @@ class FusionConvBNReluOp : public framework::OperatorWithKernel<
  FusionConvBNReluOp(const string &type, const VariableNameMap &inputs,
                     const VariableNameMap &outputs,
                     const framework::AttributeMap &attrs,
-                     std::shared_ptr<framework::Scope> scope)
+                     framework::Scope *scope)
      : framework::OperatorWithKernel<
            DeviceType, FusionConvBNReluParam<DeviceType>,
            operators::ConvBNReluKernel<DeviceType, T>>(type, inputs, outputs,

--- a/src/operators/fusion_deconv_add_bn_op.cpp
+++ b/src/operators/fusion_deconv_add_bn_op.cpp
@@ -24,8 +24,7 @@ namespace ops = paddle_mobile::operators;
 REGISTER_FUSION_MATCHER(fusion_deconv_add_bn, ops::FusionDeconvAddBNMatcher);
 #ifdef PADDLE_MOBILE_CPU
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
+
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(fusion_deconv_add_bn, ops::FusionDeconvAddBNOp);
 #endif

--- a/src/operators/fusion_deconv_add_bn_op.h
+++ b/src/operators/fusion_deconv_add_bn_op.h
@@ -57,7 +57,7 @@ class FusionDeconvAddBNOp : public framework::OperatorWithKernel<
  FusionDeconvAddBNOp(const string &type, const VariableNameMap &inputs,
                      const VariableNameMap &outputs,
                      const framework::AttributeMap &attrs,
-                      std::shared_ptr<framework::Scope> scope)
+                      framework::Scope *scope)
      : framework::OperatorWithKernel<
            DeviceType, FusionDeconvAddBNParam<DeviceType>,
            operators::DeconvAddBNKernel<DeviceType, T>>(type, inputs, outputs,

--- a/src/operators/fusion_deconv_add_bn_relu_op.cpp
+++ b/src/operators/fusion_deconv_add_bn_relu_op.cpp
@@ -25,8 +25,7 @@ REGISTER_FUSION_MATCHER(fusion_deconv_add_bn_relu,
                        ops::FusionDeconvAddBNReluMatcher);
 #ifdef PADDLE_MOBILE_CPU
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
+
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(fusion_deconv_add_bn_relu, ops::FusionDeconvAddBNReluOp);
 #endif

--- a/src/operators/fusion_deconv_add_bn_relu_op.h
+++ b/src/operators/fusion_deconv_add_bn_relu_op.h
@@ -59,7 +59,7 @@ class FusionDeconvAddBNReluOp
  FusionDeconvAddBNReluOp(const string &type, const VariableNameMap &inputs,
                          const VariableNameMap &outputs,
                          const framework::AttributeMap &attrs,
-                          std::shared_ptr<framework::Scope> scope)
+                          framework::Scope *scope)
      : framework::OperatorWithKernel<
            DeviceType, FusionDeconvAddBNReluParam<DeviceType>,
            operators::DeconvAddBNReluKernel<DeviceType, T>>(

--- a/src/operators/fusion_deconv_add_op.cpp
+++ b/src/operators/fusion_deconv_add_op.cpp
@@ -24,8 +24,7 @@ namespace ops = paddle_mobile::operators;
 REGISTER_FUSION_MATCHER(fusion_deconv_add, ops::FusionDeconvAddMatcher);
 #ifdef PADDLE_MOBILE_CPU
 #endif
-#ifdef PADDLE_MOBILE_MALI_GPU
-#endif
+
 #ifdef PADDLE_MOBILE_FPGA
 REGISTER_OPERATOR_FPGA(fusion_deconv_add, ops::FusionDeconvAddOp);
 #endif

--- a/src/operators/fusion_deconv_add_op.h
+++ b/src/operators/fusion_deconv_add_op.h
@@ -49,7 +49,7 @@ class FusionDeconvAddOp : public framework::OperatorWithKernel<
  FusionDeconvAddOp(const string &type, const VariableNameMap &inputs,
                    const VariableNameMap &outputs,
                    const framework::AttributeMap &attrs,
-                    std::shared_ptr<framework::Scope> scope)
+                    framework::Scope *scope)
      : framework::OperatorWithKernel<
            DeviceType, FusionDeconvAddParam<DeviceType>,
            operators::DeconvAddKernel<DeviceType, T>>(type, inputs, outputs,

--- a/src/operators/fusion_deconv_add_relu_op.cpp
+++ b/src/operators/fusion_deconv_add_relu_op.cpp
--- a/src/operators/fusion_deconv_add_relu_op.h
+++ b/src/operators/fusion_deconv_add_relu_op.h
--- a/src/operators/fusion_deconv_bn_relu_op.cpp
+++ b/src/operators/fusion_deconv_bn_relu_op.cpp
--- a/src/operators/fusion_deconv_bn_relu_op.h
+++ b/src/operators/fusion_deconv_bn_relu_op.h
--- a/src/operators/fusion_deconv_relu_op.cpp
+++ b/src/operators/fusion_deconv_relu_op.cpp
--- a/src/operators/fusion_deconv_relu_op.h
+++ b/src/operators/fusion_deconv_relu_op.h
--- a/src/operators/fusion_dequant_add_bn_op.h
+++ b/src/operators/fusion_dequant_add_bn_op.h
--- a/src/operators/fusion_dequant_add_bn_relu_op.h
+++ b/src/operators/fusion_dequant_add_bn_relu_op.h
--- a/src/operators/fusion_dequant_add_bn_relu_quant_op.h
+++ b/src/operators/fusion_dequant_add_bn_relu_quant_op.h
--- a/src/operators/fusion_dequant_bn_op.h
+++ b/src/operators/fusion_dequant_bn_op.h
--- a/src/operators/fusion_dequant_bn_relu_op.h
+++ b/src/operators/fusion_dequant_bn_relu_op.h
--- a/src/operators/fusion_dwconv_bn_relu_op.cpp
+++ b/src/operators/fusion_dwconv_bn_relu_op.cpp
--- a/src/operators/fusion_dwconv_bn_relu_op.h
+++ b/src/operators/fusion_dwconv_bn_relu_op.h
--- a/src/operators/fusion_elementwise_add_relu_op.cpp
+++ b/src/operators/fusion_elementwise_add_relu_op.cpp
--- a/src/operators/fusion_elementwise_add_relu_op.h
+++ b/src/operators/fusion_elementwise_add_relu_op.h
--- a/src/operators/fusion_fc_op.cpp
+++ b/src/operators/fusion_fc_op.cpp
--- a/src/operators/fusion_fc_op.h
+++ b/src/operators/fusion_fc_op.h
--- a/src/operators/fusion_fc_relu_op.cpp
+++ b/src/operators/fusion_fc_relu_op.cpp
--- a/src/operators/fusion_fc_relu_op.h
+++ b/src/operators/fusion_fc_relu_op.h
--- a/src/operators/gru_op.h
+++ b/src/operators/gru_op.h
--- a/src/operators/gru_unit_op.cpp
+++ b/src/operators/gru_unit_op.cpp
--- a/src/operators/gru_unit_op.h
+++ b/src/operators/gru_unit_op.h
--- a/src/operators/im2sequence_op.h
+++ b/src/operators/im2sequence_op.h
--- a/src/operators/increment_op.cpp
+++ b/src/operators/increment_op.cpp
--- a/src/operators/increment_op.h
+++ b/src/operators/increment_op.h
--- a/src/operators/is_empty_op.cpp
+++ b/src/operators/is_empty_op.cpp
--- a/src/operators/is_empty_op.h
+++ b/src/operators/is_empty_op.h
--- a/src/operators/kernel/arm/activation_kernel.cpp
+++ b/src/operators/kernel/arm/activation_kernel.cpp
--- a/src/operators/kernel/arm/beam_search_decode_kernel.cpp
+++ b/src/operators/kernel/arm/beam_search_decode_kernel.cpp
--- a/src/operators/kernel/arm/beam_search_kernel.cpp
+++ b/src/operators/kernel/arm/beam_search_kernel.cpp
--- a/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/arm/conv_add_bn_relu_kernel.cpp
--- a/src/operators/kernel/arm/convolution/conv_add_kernel.cpp
+++ b/src/operators/kernel/arm/convolution/conv_add_kernel.cpp
--- a/src/operators/kernel/arm/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/arm/conv_add_relu_kernel.cpp
--- a/src/operators/kernel/arm/conv_bn_add_relu_kernel.cpp
+++ b/src/operators/kernel/arm/conv_bn_add_relu_kernel.cpp
--- a/src/operators/kernel/arm/conv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/arm/conv_bn_relu_kernel.cpp
--- a/src/operators/kernel/arm/conv_kernel.cpp
+++ b/src/operators/kernel/arm/conv_kernel.cpp
--- a/src/operators/kernel/arm/convolution/conv_common.h
+++ b/src/operators/kernel/arm/convolution/conv_common.h
--- a/src/operators/kernel/mali/elementwise_add_kernel.cpp
+++ b/src/operators/kernel/mali/elementwise_add_kernel.cpp
--- a/src/operators/kernel/arm/conv_transpose_kernel.cpp
+++ b/src/operators/kernel/arm/conv_transpose_kernel.cpp
--- a/src/operators/kernel/arm/dwconv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/arm/dwconv_bn_relu_kernel.cpp
--- a/src/operators/kernel/mali/fetch_kernel.cpp
+++ b/src/operators/kernel/mali/fetch_kernel.cpp
--- a/src/operators/kernel/arm/feed_kernel.cpp
+++ b/src/operators/kernel/arm/feed_kernel.cpp
--- a/src/operators/kernel/arm/fetch_kernel.cpp
+++ b/src/operators/kernel/arm/fetch_kernel.cpp
--- a/src/operators/kernel/mali/reshape_kernel.cpp
+++ b/src/operators/kernel/mali/reshape_kernel.cpp
--- a/src/operators/kernel/arm/pad2d_kernel.cpp
+++ b/src/operators/kernel/arm/pad2d_kernel.cpp
--- a/src/operators/kernel/arm/scale_kernel.cpp
+++ b/src/operators/kernel/arm/scale_kernel.cpp
--- a/src/operators/kernel/arm/sequence_expand_kernel.cpp
+++ b/src/operators/kernel/arm/sequence_expand_kernel.cpp
--- a/src/operators/kernel/arm/sequence_softmax_kernel.cpp
+++ b/src/operators/kernel/arm/sequence_softmax_kernel.cpp
--- a/src/operators/kernel/arm/softmax_kernel.cpp
+++ b/src/operators/kernel/arm/softmax_kernel.cpp
--- a/src/operators/kernel/arm/tensor_array_read_write_kernel.cpp
+++ b/src/operators/kernel/arm/tensor_array_read_write_kernel.cpp
--- a/src/operators/kernel/arm/while_kernel.cpp
+++ b/src/operators/kernel/arm/while_kernel.cpp
--- a/src/operators/kernel/conv_add_prelu_kernel.h
+++ b/src/operators/kernel/conv_add_prelu_kernel.h
--- a/src/operators/fusion_conv_add_prelu_op.h
+++ b/src/operators/fusion_conv_add_prelu_op.h
--- a/src/operators/kernel/central-arm-func/conv_add_prelu_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_add_prelu_arm_func.h
--- a/src/operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h
--- a/src/operators/kernel/central-arm-func/conv_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_arm_func.h
--- a/src/operators/kernel/central-arm-func/conv_transpose_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_transpose_arm_func.h
--- a/src/operators/kernel/central-arm-func/elementwise_sub_arm_func.h
+++ b/src/operators/kernel/central-arm-func/elementwise_sub_arm_func.h
--- a/src/operators/kernel/central-arm-func/gru_unit_arm_func.h
+++ b/src/operators/kernel/central-arm-func/gru_unit_arm_func.h
--- a/src/operators/kernel/central-arm-func/increment_arm_func.h
+++ b/src/operators/kernel/central-arm-func/increment_arm_func.h
--- a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
+++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
--- a/src/operators/kernel/cl/cl_kernel/sigmoid.cl
+++ b/src/operators/kernel/cl/cl_kernel/sigmoid.cl
--- a/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
--- a/src/operators/kernel/cl/conv_add_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_kernel.cpp
--- a/src/operators/kernel/cl/feed_kernel.cpp
+++ b/src/operators/kernel/cl/feed_kernel.cpp
--- a/src/operators/kernel/cl/fetch_kernel.cpp
+++ b/src/operators/kernel/cl/fetch_kernel.cpp
--- a/src/operators/kernel/cl/reshape_kernel.cpp
+++ b/src/operators/kernel/cl/reshape_kernel.cpp
--- a/src/operators/kernel/cl/sigmoid_kernel.cpp
+++ b/src/operators/kernel/cl/sigmoid_kernel.cpp
--- a/src/operators/kernel/conv_add_bn_kernel.h
+++ b/src/operators/kernel/conv_add_bn_kernel.h
--- a/src/operators/kernel/conv_add_bn_relu_kernel.h
+++ b/src/operators/kernel/conv_add_bn_relu_kernel.h
--- a/src/operators/kernel/conv_add_kernel.h
+++ b/src/operators/kernel/conv_add_kernel.h
--- a/src/operators/kernel/conv_add_relu_kernel.h
+++ b/src/operators/kernel/conv_add_relu_kernel.h
--- a/src/operators/kernel/conv_bn_add_relu_kernel.h
+++ b/src/operators/kernel/conv_bn_add_relu_kernel.h
--- a/src/operators/kernel/conv_bn_kernel.h
+++ b/src/operators/kernel/conv_bn_kernel.h
--- a/src/operators/kernel/conv_bn_relu_kernel.h
+++ b/src/operators/kernel/conv_bn_relu_kernel.h
--- a/src/operators/kernel/dwconv_bn_relu_kernel.h
+++ b/src/operators/kernel/dwconv_bn_relu_kernel.h
--- a/src/operators/kernel/fpga/V1/conv_add_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/conv_add_bn_kernel.cpp
--- a/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp
--- a/src/operators/kernel/fpga/V1/conv_add_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/conv_add_kernel.cpp
--- a/src/operators/kernel/fpga/V1/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/conv_add_relu_kernel.cpp
--- a/src/operators/kernel/fpga/V1/conv_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/conv_bn_kernel.cpp
--- a/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp
--- a/src/operators/kernel/fpga/V1/conv_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/conv_kernel.cpp
--- a/src/operators/kernel/fpga/V1/conv_transpose_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/conv_transpose_kernel.cpp
--- a/src/operators/kernel/fpga/V1/deconv_add_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/deconv_add_bn_kernel.cpp
--- a/src/operators/kernel/fpga/V1/deconv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/deconv_add_bn_relu_kernel.cpp
--- a/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp
--- a/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp
--- a/src/operators/kernel/fpga/V1/deconv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/deconv_bn_relu_kernel.cpp
--- a/src/operators/kernel/fpga/V1/feed_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/feed_kernel.cpp
--- a/src/operators/kernel/fpga/V1/fetch_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/fetch_kernel.cpp
--- a/src/operators/kernel/fpga/V1/fusion_fc_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/fusion_fc_kernel.cpp
--- a/src/operators/kernel/fpga/V1/fusion_fc_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/fusion_fc_relu_kernel.cpp
--- a/src/operators/kernel/fpga/V1/pad2d_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/pad2d_kernel.cpp
--- a/src/operators/kernel/fpga/V1/pool_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/pool_kernel.cpp
--- a/src/operators/kernel/fpga/V1/proposal_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/proposal_kernel.cpp
--- a/src/operators/kernel/fpga/V1/sigmoid_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/sigmoid_kernel.cpp
--- a/src/operators/kernel/fpga/V1/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/softmax_kernel.cpp
--- a/src/operators/kernel/fpga/V1/split_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/split_kernel.cpp
--- a/src/operators/kernel/fpga/V1/tanh_kernel.cpp
+++ b/src/operators/kernel/fpga/V1/tanh_kernel.cpp
--- a/src/operators/kernel/lrn_kernel.h
+++ b/src/operators/kernel/lrn_kernel.h
--- a/ACL_Android @ 591027fc
+++ b/ACL_Android @ 591027fc
--- a/src/operators/kernel/mali/acl_operator.cc
+++ b/src/operators/kernel/mali/acl_operator.cc
--- a/src/operators/kernel/mali/acl_operator.h
+++ b/src/operators/kernel/mali/acl_operator.h
--- a/src/operators/kernel/mali/acl_tensor.cc
+++ b/src/operators/kernel/mali/acl_tensor.cc
--- a/src/operators/kernel/mali/acl_tensor.h
+++ b/src/operators/kernel/mali/acl_tensor.h
--- a/src/operators/kernel/mali/batchnorm_kernel.cpp
+++ b/src/operators/kernel/mali/batchnorm_kernel.cpp
--- a/src/operators/kernel/mali/concat_kernel.cpp
+++ b/src/operators/kernel/mali/concat_kernel.cpp
--- a/src/operators/kernel/mali/conv_add_kernel.cpp
+++ b/src/operators/kernel/mali/conv_add_kernel.cpp
--- a/src/operators/kernel/mali/conv_kernel.cpp
+++ b/src/operators/kernel/mali/conv_kernel.cpp
--- a/src/operators/kernel/mali/fushion_fc_kernel.cpp
+++ b/src/operators/kernel/mali/fushion_fc_kernel.cpp
--- a/src/operators/kernel/mali/lrn_kernel.cpp
+++ b/src/operators/kernel/mali/lrn_kernel.cpp
--- a/src/operators/kernel/mali/pool_kernel.cpp
+++ b/src/operators/kernel/mali/pool_kernel.cpp
--- a/src/operators/kernel/mali/relu_kernel.cpp
+++ b/src/operators/kernel/mali/relu_kernel.cpp
--- a/src/operators/kernel/mali/softmax_kernel.cpp
+++ b/src/operators/kernel/mali/softmax_kernel.cpp
--- a/src/operators/kernel/conv_add_add_prelu_kernel.h
+++ b/src/operators/kernel/conv_add_add_prelu_kernel.h
--- a/src/operators/kernel/pad2d_kernel.h
+++ b/src/operators/kernel/pad2d_kernel.h
--- a/src/operators/kernel/prior_box_kernel.h
+++ b/src/operators/kernel/prior_box_kernel.h
--- a/src/operators/kernel/while_kernel.h
+++ b/src/operators/kernel/while_kernel.h
--- a/src/operators/lookup_op.cpp
+++ b/src/operators/lookup_op.cpp
--- a/src/operators/lookup_op.h
+++ b/src/operators/lookup_op.h
--- a/src/operators/lrn_op.cpp
+++ b/src/operators/lrn_op.cpp
--- a/src/operators/lrn_op.h
+++ b/src/operators/lrn_op.h
--- a/src/operators/math/activation.h
+++ b/src/operators/math/activation.h
--- a/src/operators/math/channel_wise.h
+++ b/src/operators/math/channel_wise.h
--- a/src/operators/math/conv_func.h
+++ b/src/operators/math/conv_func.h
--- a/src/operators/math/depthwise_conv3x3.cpp
+++ b/src/operators/math/depthwise_conv3x3.cpp
--- a/src/operators/math/depthwise_conv3x3.h
+++ b/src/operators/math/depthwise_conv3x3.h
--- a/src/operators/math/depthwise_conv3x3_int8.cpp
+++ b/src/operators/math/depthwise_conv3x3_int8.cpp
--- a/src/operators/math/depthwise_conv3x3_int8_arm64.cpp
+++ b/src/operators/math/depthwise_conv3x3_int8_arm64.cpp
--- a/src/operators/math/depthwise_conv5x5.cpp
+++ b/src/operators/math/depthwise_conv5x5.cpp
--- a/src/operators/math/depthwise_conv5x5.h
+++ b/src/operators/math/depthwise_conv5x5.h
--- a/src/operators/math/gemm.cpp
+++ b/src/operators/math/gemm.cpp
--- a/src/operators/math/gemm.h
+++ b/src/operators/math/gemm.h
--- a/src/operators/kernel/mali/mul_kernel.cpp
+++ b/src/operators/kernel/mali/mul_kernel.cpp
--- a/src/operators/kernel/arm/conv_add_prelu_kernel.cpp
+++ b/src/operators/kernel/arm/conv_add_prelu_kernel.cpp
--- a/src/operators/math/gemm/executor.h
+++ b/src/operators/math/gemm/executor.h
--- a/src/operators/math/gemm/gemm_kernel.h
+++ b/src/operators/math/gemm/gemm_kernel.h
--- a/src/operators/math/gemm/pack_kernel.h
+++ b/src/operators/math/gemm/pack_kernel.h
--- a/src/operators/math/gemm/strategy.h
+++ b/src/operators/math/gemm/strategy.h
--- a/src/operators/math/gru_compute.cpp
+++ b/src/operators/math/gru_compute.cpp
--- a/src/operators/math/im2col.cpp
+++ b/src/operators/math/im2col.cpp
--- a/src/operators/math/im2col.h
+++ b/src/operators/math/im2col.h
--- a/src/operators/math/math_func_neon.h
+++ b/src/operators/math/math_func_neon.h
--- a/src/operators/math/math_function.cpp
+++ b/src/operators/math/math_function.cpp
--- a/src/operators/math/softmax.cpp
+++ b/src/operators/math/softmax.cpp
--- a/src/operators/math/winograd/winograd_transform_f6k3.cpp
+++ b/src/operators/math/winograd/winograd_transform_f6k3.cpp
--- a/src/operators/math/winograd/winograd_transform_f6k3_arm64.cpp
+++ b/src/operators/math/winograd/winograd_transform_f6k3_arm64.cpp
--- a/src/operators/mul_op.cpp
+++ b/src/operators/mul_op.cpp
--- a/src/operators/mul_op.h
+++ b/src/operators/mul_op.h
--- a/src/operators/multiclass_nms_op.h
+++ b/src/operators/multiclass_nms_op.h
--- a/src/operators/norm_op.cpp
+++ b/src/operators/norm_op.cpp
--- a/src/operators/norm_op.h
+++ b/src/operators/norm_op.h
--- a/src/operators/kernel/arm/conv_add_add_prelu_kernel.cpp
+++ b/src/operators/kernel/arm/conv_add_add_prelu_kernel.cpp
--- a/src/operators/one_hot_op.h
+++ b/src/operators/one_hot_op.h
--- a/src/operators/op_param.cpp
+++ b/src/operators/op_param.cpp
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
--- a/src/operators/pad2d_op.cpp
+++ b/src/operators/pad2d_op.cpp
--- a/src/operators/pad2d_op.h
+++ b/src/operators/pad2d_op.h
--- a/src/operators/polygon_box_transform_op.h
+++ b/src/operators/polygon_box_transform_op.h
--- a/src/operators/pool_op.cpp
+++ b/src/operators/pool_op.cpp
--- a/src/operators/pool_op.h
+++ b/src/operators/pool_op.h
--- a/src/operators/prelu_op.cpp
+++ b/src/operators/prelu_op.cpp
--- a/src/operators/prelu_op.h
+++ b/src/operators/prelu_op.h
--- a/src/operators/prior_box_op.cpp
+++ b/src/operators/prior_box_op.cpp
--- a/src/operators/prior_box_op.h
+++ b/src/operators/prior_box_op.h
--- a/src/operators/quantize_op.h
+++ b/src/operators/quantize_op.h
--- a/src/operators/reshape2_op.cpp
+++ b/src/operators/reshape2_op.cpp
--- a/src/operators/reshape2_op.h
+++ b/src/operators/reshape2_op.h
--- a/src/operators/reshape_op.cpp
+++ b/src/operators/reshape_op.cpp
--- a/src/operators/reshape_op.h
+++ b/src/operators/reshape_op.h
--- a/src/operators/resize_op.cpp
+++ b/src/operators/resize_op.cpp
--- a/src/operators/resize_op.h
+++ b/src/operators/resize_op.h
--- a/src/operators/scale_op.cpp
+++ b/src/operators/scale_op.cpp
--- a/src/operators/scale_op.h
+++ b/src/operators/scale_op.h
--- a/src/operators/sequence_ops/sequence_expand_op.h
+++ b/src/operators/sequence_ops/sequence_expand_op.h
--- a/src/operators/sequence_ops/sequence_pool_op.h
+++ b/src/operators/sequence_ops/sequence_pool_op.h
--- a/src/operators/sequence_ops/sequence_softmax_op.h
+++ b/src/operators/sequence_ops/sequence_softmax_op.h
--- a/src/operators/shape_op.cpp
+++ b/src/operators/shape_op.cpp
--- a/src/operators/shape_op.h
+++ b/src/operators/shape_op.h
--- a/src/operators/slice_op.cpp
+++ b/src/operators/slice_op.cpp
--- a/src/operators/slice_op.h
+++ b/src/operators/slice_op.h
--- a/src/operators/softmax_op.cpp
+++ b/src/operators/softmax_op.cpp
--- a/src/operators/softmax_op.h
+++ b/src/operators/softmax_op.h
--- a/src/operators/split_op.h
+++ b/src/operators/split_op.h
--- a/src/operators/sum_op.cpp
+++ b/src/operators/sum_op.cpp
--- a/src/operators/sum_op.h
+++ b/src/operators/sum_op.h
--- a/src/operators/top_k_op.cpp
+++ b/src/operators/top_k_op.cpp
--- a/src/operators/top_k_op.h
+++ b/src/operators/top_k_op.h
--- a/src/operators/transpose2_op.h
+++ b/src/operators/transpose2_op.h
--- a/src/operators/transpose_op.h
+++ b/src/operators/transpose_op.h
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
--- a/test/common/test_gemm_accuracy.cpp
+++ b/test/common/test_gemm_accuracy.cpp
--- a/test/executor_for_test.h
+++ b/test/executor_for_test.h
--- a/test/fpga/test_marker_api.cpp
+++ b/test/fpga/test_marker_api.cpp
--- a/test/fpga/test_mobilenet_api.cpp
+++ b/test/fpga/test_mobilenet_api.cpp
--- a/test/fpga/test_rfcn_api.cpp
+++ b/test/fpga/test_rfcn_api.cpp
--- a/test/fpga/test_yolo_api.cpp
+++ b/test/fpga/test_yolo_api.cpp
--- a/test/net/test_benchmark.cpp
+++ b/test/net/test_benchmark.cpp
--- a/test/net/test_mobilenet_GPU.cpp
+++ b/test/net/test_mobilenet_GPU.cpp
--- a/test/net/test_ocr.cpp
+++ b/test/net/test_ocr.cpp
--- a/test/operators/test_batchnorm_op.cpp
+++ b/test/operators/test_batchnorm_op.cpp
--- a/test/operators/test_box_coder_op.cpp
+++ b/test/operators/test_box_coder_op.cpp
--- a/test/operators/test_cast_op.cpp
+++ b/test/operators/test_cast_op.cpp
--- a/test/operators/test_concat_op.cpp
+++ b/test/operators/test_concat_op.cpp
--- a/test/operators/test_conv_bn_relu_op.cpp
+++ b/test/operators/test_conv_bn_relu_op.cpp
--- a/test/operators/test_conv_op.cpp
+++ b/test/operators/test_conv_op.cpp
--- a/test/operators/test_dequantize_op.cpp
+++ b/test/operators/test_dequantize_op.cpp
--- a/test/operators/test_dwconv_bn_relu_op.cpp
+++ b/test/operators/test_dwconv_bn_relu_op.cpp
--- a/test/operators/test_elementwise_sub_op.cpp
+++ b/test/operators/test_elementwise_sub_op.cpp
--- a/test/operators/test_fill_constant_op.cpp
+++ b/test/operators/test_fill_constant_op.cpp
--- a/test/operators/test_fusion_fc_op.cpp
+++ b/test/operators/test_fusion_fc_op.cpp
--- a/test/operators/test_gru_op.cpp
+++ b/test/operators/test_gru_op.cpp
--- a/test/operators/test_im2sequence_op.cpp
+++ b/test/operators/test_im2sequence_op.cpp
--- a/test/operators/test_increment_op.cpp
+++ b/test/operators/test_increment_op.cpp
--- a/test/operators/test_is_empty_op.cpp
+++ b/test/operators/test_is_empty_op.cpp
--- a/test/operators/test_less_than_op.cpp
+++ b/test/operators/test_less_than_op.cpp
--- a/test/operators/test_log_op.cpp
+++ b/test/operators/test_log_op.cpp
--- a/test/operators/test_logical_and_op.cpp
+++ b/test/operators/test_logical_and_op.cpp
--- a/test/operators/test_logical_not_op.cpp
+++ b/test/operators/test_logical_not_op.cpp
--- a/test/operators/test_logical_or_op.cpp
+++ b/test/operators/test_logical_or_op.cpp
--- a/test/operators/test_logical_xor_op.cpp
+++ b/test/operators/test_logical_xor_op.cpp
--- a/test/operators/test_mul_op.cpp
+++ b/test/operators/test_mul_op.cpp
--- a/test/operators/test_multiclass_nms_op.cpp
+++ b/test/operators/test_multiclass_nms_op.cpp
--- a/test/operators/test_polygon_box_transform_op.cpp
+++ b/test/operators/test_polygon_box_transform_op.cpp
--- a/test/operators/test_pool_op.cpp
+++ b/test/operators/test_pool_op.cpp
--- a/test/operators/test_prior_box_op.cpp
+++ b/test/operators/test_prior_box_op.cpp
--- a/test/operators/test_quantize_op.cpp
+++ b/test/operators/test_quantize_op.cpp
--- a/test/operators/test_relu6_op.cpp
+++ b/test/operators/test_relu6_op.cpp
--- a/test/operators/test_relu_op.cpp
+++ b/test/operators/test_relu_op.cpp
--- a/test/operators/test_reshape2_op.cpp
+++ b/test/operators/test_reshape2_op.cpp
--- a/test/operators/test_sequence_expand_op.cpp
+++ b/test/operators/test_sequence_expand_op.cpp
--- a/test/operators/test_sequence_pool_op.cpp
+++ b/test/operators/test_sequence_pool_op.cpp
--- a/test/operators/test_sequence_softmax_op.cpp
+++ b/test/operators/test_sequence_softmax_op.cpp
--- a/test/operators/test_sigmoid_op.cpp
+++ b/test/operators/test_sigmoid_op.cpp
--- a/test/operators/test_softmax_op.cpp
+++ b/test/operators/test_softmax_op.cpp
--- a/test/operators/test_sum_op.cpp
+++ b/test/operators/test_sum_op.cpp
--- a/test/operators/test_tanh_op.cpp
+++ b/test/operators/test_tanh_op.cpp
--- a/test/operators/test_topk_op.cpp
+++ b/test/operators/test_topk_op.cpp
--- a/test/operators/test_transpose2_op.cpp
+++ b/test/operators/test_transpose2_op.cpp
--- a/test/test_helper.h
+++ b/test/test_helper.h
--- a/tools/build.sh
+++ b/tools/build.sh
--- a/tools/ci_build.sh
+++ b/tools/ci_build.sh
--- a/tools/op.cmake
+++ b/tools/op.cmake