Commit 58c809e8 authored by: H hjchen2

Refine: fix sigmoid and nlp unit test

cmake_minimum_required(VERSION 3.6)
cmake_minimum_required(VERSION 3.0)
option(USE_OPENMP "openmp support" OFF)
project(paddle-mobile)
......@@ -30,6 +30,7 @@ else()
set(CMAKE_BUILD_TYPE Release)
set(CMAKE_CXX_FLAGS "-Os ${CMAKE_CXX_FLAGS}")
set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG")
add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden)
endif()
if(USE_EXCEPTION)
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
import Metal
import MetalKit
import Foundation
@objc public enum Platform: Int{
case CPU, GPU
}
class ScaleKernel: CusomKernel {
init(device: MTLDevice, shape: Shape) {
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "scale", outputDim: shape, usePaddleMobileLib: false)
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "scale_half", outputDim: shape, usePaddleMobileLib: false)
} else {
fatalError(" unsupport ")
}
}
}
public class Runner: NSObject {
var program: Program?
var executor: Executor<Float32>?
var queue: MTLCommandQueue?
var textureLoader: MTKTextureLoader?
public let net: Net
let device: MTLDevice?
let platform: Platform
var cpuPaddleMobile: PaddleMobileCPU?
let numel: Int
let meansNumber: [NSNumber]
// dims in NCHW order
let dimsNum: [NSNumber]
/**
* inNet: the network to run
* commandQueue: required when running on the GPU
* inPlatform: the platform to run on, GPU or CPU
*/
@objc public init(inNet: Net, commandQueue: MTLCommandQueue?, inPlatform: Platform) {
net = inNet
queue = commandQueue
device = queue?.device
platform = inPlatform
if let inDevice = device {
textureLoader = MTKTextureLoader.init(device: inDevice)
}
if platform == .CPU {
cpuPaddleMobile = PaddleMobileCPU.init()
}
numel = net.dim.n * net.dim.c * net.dim.h * net.dim.w
meansNumber = net.means.map { NSNumber.init(value: $0) }
dimsNum = [NSNumber.init(value: net.dim.n),
NSNumber.init(value: net.dim.c),
NSNumber.init(value: net.dim.h),
NSNumber.init(value: net.dim.w)]
}
/**
* Load the model; returns true if prediction can proceed
*/
@objc public func load() -> Bool {
if platform == .GPU {
guard let inDevice = device, let inQueue = queue else {
print(" paddle mobile gpu load error, need MTLCommandQueue")
return false
}
let loader = Loader<Float32>.init()
do {
// program = try loader.load(device: inDevice, paramPointer: net.paramPointer!, paramSize: net.paramSize,modePointer:net.modelPointer!,modelSize:net.modelSize)
program = try loader.load(device: inDevice, modelPath: net.modelPath, paraPath: net.paramPath)
net.updateProgram(program: program!)
executor = try Executor<Float32>.init(inDevice: inDevice, inQueue: inQueue, inProgram: program!)
} catch let error {
print(error)
return false
}
} else {
return cpuPaddleMobile?.load(net.modelPath, andWeightsPath: net.paramPath) ?? false
}
return true
}
@objc public func predict(inputPointer: UnsafeMutablePointer<Float32>, completion: @escaping ( _ success: Bool, _ result: PaddleMobileCPUResult?) -> Void) {
guard let res = cpuPaddleMobile?.predictInput(inputPointer, dim: dimsNum) else {
completion(false, nil)
return
}
completion(true, res)
}
/**
* GPU version of predict
* texture: the texture to run prediction on; it must already be preprocessed
* ( _ success: Bool, _ time: TimeInterval, _ resultArray: [Float32]) -> Void: completion closure whose three parameters are: success flag, prediction time, and the result array
*/
@objc public func predict(texture: MTLTexture, completion: @escaping ( _ success: Bool, _ result: ResultHolder?) -> Void) {
do {
try self.executor?.predict(input: texture, dim: [self.net.dim.n, self.net.dim.h, self.net.dim.w, self.net.dim.c], completionHandle: { [weak self] (res) in
guard let SSelf = self else {
fatalError( " self nil " )
}
let result = SSelf.net.fetchResult(paddleMobileRes: res)
completion(true, result)
}, preProcessKernle: self.net.preprocessKernel, except: self.net.except)
} catch let error {
print(error)
completion(false, nil)
return
}
}
/**
* predict shared by CPU and GPU
* cgImage: the image to run prediction on
* ( _ success: Bool, _ time: TimeInterval, _ resultArray: [Float32]) -> Void: completion closure whose three parameters are: success flag, prediction time, and the result array
*/
// @objc public func predict(cgImage: CGImage, completion: @escaping ( _ success: Bool, _ resultArray: [Float32]) -> Void) {
// if platform == .GPU {
// getTexture(image: cgImage) { [weak self] (texture) in
// guard let SSelf = self else {
// fatalError( "" )
// }
// SSelf.predict(texture: texture, completion: completion)
// }
// } else if platform == .CPU {
// let input = preproccess(image: cgImage)
// predict(inputPointer: input, completion: completion)
// input.deinitialize(count: numel)
// input.deallocate()
// }
// }
/*
* Release memory; after calling this the runner cannot be used again until load is called again
*/
@objc public func clear() {
if platform == .GPU {
executor?.clear()
executor = nil
program = nil
} else if platform == .CPU {
cpuPaddleMobile?.clear()
}
}
@objc public func preproccess(image: CGImage) -> UnsafeMutablePointer<Float> {
let output = UnsafeMutablePointer<Float>.allocate(capacity: numel)
let means = net.means.map { NSNumber.init(value: $0) }
let dims = [NSNumber.init(value: net.dim.n),
NSNumber.init(value: net.dim.c),
NSNumber.init(value: net.dim.h),
NSNumber.init(value: net.dim.w)]
cpuPaddleMobile?.preprocess(image, output: output, means: means, scale: net.scale, dim: dims)
return output
}
/*
* Get a texture and preprocess it; used for GPU prediction
*/
@objc public func getTexture(image: CGImage, getTexture: @escaping (MTLTexture) -> Void) {
let texture = try? textureLoader?.newTexture(cgImage: image, options: [:]) ?! " texture loader error"
scaleTexture(input: texture!, complete: getTexture)
}
public func scaleTexture(input: MTLTexture , complete: @escaping (MTLTexture) -> Void) {
guard let inQueue = queue, let inDevice = device else {
fatalError( " queue or devcie nil " )
}
guard let buffer = inQueue.makeCommandBuffer() else {
fatalError( " make buffer error" )
}
let scaleKernel = ScaleKernel.init(device: inDevice, shape: CusomKernel.Shape.init(inWidth: net.dim.w, inHeight: net.dim.h, inChannel: 3))
do {
try scaleKernel.compute(inputTexuture: input, commandBuffer: buffer)
} catch let error {
print(error)
fatalError()
}
buffer.addCompletedHandler { (buffer) in
complete(scaleKernel.outputTexture)
}
buffer.commit()
}
}
......@@ -218,6 +218,8 @@ int get_aligned_filter_num(int num) {
void format_filter(framework::Tensor *filter_tensor, float max_value,
int group_num) {
filter_tensor->scale[0] = float(max_value / 127.0);
filter_tensor->scale[1] = float(127.0 / max_value);
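// scale[0] (max_value / 127) dequantizes the filter values; scale[1] is its reciprocal, used for quantization.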
auto dims = filter_tensor->dims();
auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3];
auto data_ptr = filter_tensor->mutable_data<float>();
......
......@@ -393,6 +393,77 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
return result_vector;
}
#ifdef PADDLE_MOBILE_FPGA
template <typename Dtype, Precision P>
void Executor<Dtype, P>::InjectVariable(const framework::Tensor &t,
string var_name) {
framework::Variable *g_feed_value = program_.scope->Var(var_name);
framework::Tensor *feed_tensor =
g_feed_value->GetMutable<framework::LoDTensor>();
feed_tensor->Resize(t.dims());
feed_tensor->ShareDataWith(t);
};
template <typename Dtype, Precision P>
void Executor<Dtype, P>::FeedData(const framework::Tensor &t) {
InjectVariable(t, "feed");
};
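// FetchResult(id): return a copy of the first output of op `id` in block 0; a negative id fetches the output of the last op.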
template <typename Dtype, Precision P>
std::shared_ptr<framework::Tensor> Executor<Dtype, P>::FetchResult(int id) {
std::shared_ptr<framework::BlockDesc> to_predict_block =
to_predict_program_->Block(0);
auto &ops = ops_of_block_[*to_predict_block.get()];
PADDLE_MOBILE_ENFORCE(id < (int)ops.size(), "Index out of range");
auto op = id < 0 ? ops[ops.size() - 1] : ops[id];
auto output_map = op->Outputs();
std::vector<std::string> out_keys = op->GetOutKeys();
PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "this op contains no output");
auto *output_tensor = framework::GetVarValue<framework::LoDTensor>(
out_keys[0], output_map, *(program_.scope));
return std::make_shared<framework::Tensor>(framework::Tensor(*output_tensor));
};
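// Predict_From_To(start, end): run only the ops in [start, end) of block 0; a negative end means run through the last op.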
template <typename Dtype, Precision P>
void Executor<Dtype, P>::Predict_From_To(int start, int end) {
std::shared_ptr<framework::BlockDesc> to_predict_block =
to_predict_program_->Block(0);
auto &ops = ops_of_block_[*to_predict_block.get()];
end = end < 0 ? (int)ops.size() : end;
PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(),
"start or end parameter is wrong");
#ifdef PADDLE_MOBILE_PROFILE
std::vector<ProfInfo> profile(ops.size());
#endif
for (int i = start; i < end; i++) {
#ifdef PADDLE_MOBILE_PROFILE
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
DLOG << "Running op: " << i << " " << ops[i]->Type();
ops[i]->Run();
#ifdef PADDLE_MOBILE_PROFILE
clock_gettime(CLOCK_MONOTONIC, &ts);
profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
}
};
template <typename Dtype, Precision P>
void Executor<Dtype, P>::Predict_From(int start) {
Predict_From_To(start);
};
template <typename Dtype, Precision P>
void Executor<Dtype, P>::Predict_To(int end) {
Predict_From_To(0, end);
};
#endif
template class Executor<CPU, Precision::FP32>;
template class Executor<GPU_MALI, Precision::FP32>;
template class Executor<FPGA, Precision::FP32>;
......
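The FPGA-only entry points above (FeedData, Predict_From_To, FetchResult) are exercised by the FPGA tests later in this commit. A minimal usage sketch, assuming an FPGA build where PaddleMobile<FPGA> forwards these calls to the Executor (g_resnet and SetupTensor come from the test helpers):
#ifdef PADDLE_MOBILE_FPGA
// Sketch based on the FPGA test usage in this commit.
paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
if (paddle_mobile.Load(g_resnet, true)) {
  Tensor input_tensor;
  SetupTensor<float>(&input_tensor, {1, 3, 32, 32}, static_cast<float>(0),
                     static_cast<float>(1));
  paddle_mobile.FeedData(input_tensor);            // copy the input into the "feed" variable
  paddle_mobile.Predict_To(10);                    // run ops [0, 10)
  paddle_mobile.Predict_From(10);                  // run the remaining ops
  auto tensor_ptr = paddle_mobile.FetchResult(9);  // output of op 9
  auto result_ptr = paddle_mobile.FetchResult();   // output of the last op
}
#endif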
......@@ -9,6 +9,14 @@ public class PML {
*/
public static native boolean load(String modelDir);
/**
* load separated model
*
* @param modelDir model dir
* @return whether load succeeded
*/
public static native boolean loadnlp(String modelDir);
/**
* load combined model
*
......
......@@ -74,6 +74,28 @@ JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env,
return static_cast<jboolean>(isLoadOk);
}
JNIEXPORT jboolean JNICALL
Java_com_baidu_paddle_PML_loadnlp(JNIEnv *env, jclass thiz, jstring modelPath) {
std::lock_guard<std::mutex> lock(shared_mutex);
ANDROIDLOGI("load invoked");
bool optimize = true;
bool isLoadOk = false;
#ifdef ENABLE_EXCEPTION
try {
isLoadOk = getPaddleMobileInstance()->Load(
jstring2cppstring(env, modelPath), optimize, false, true);
} catch (paddle_mobile::PaddleMobileException &e) {
ANDROIDLOGE("jni got an PaddleMobileException! ", e.what());
isLoadOk = false;
}
#else
isLoadOk = getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath),
optimize, false, true);
#endif
return static_cast<jboolean>(isLoadOk);
}
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadQualified(
JNIEnv *env, jclass thiz, jstring modelPath) {
std::lock_guard<std::mutex> lock(shared_mutex);
......
......@@ -45,8 +45,8 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
void RunImpl() const {
auto input = (Tensor *)const_cast<LoDTensor *>(param_.InputX());
auto input_ptr = input->data<float>();
fpga::format_image(input);
auto input_ptr = input->data<float>();
Tensor *output = param_.Out();
auto output_ptr = output->mutable_data<half>();
......
......@@ -47,7 +47,7 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
concatArgs.image_num = (uint32_t)image_num;
concatArgs.images_in = images_in;
concatArgs.scales_in = scales_in;
concatArgs.image_out = (half *)out->mutable_data<float>();
concatArgs.image_out = (half *)out->data<float>();
concatArgs.scale_out = out->scale;
concatArgs.channel_num = channel_num;
concatArgs.height = (uint32_t)height;
......
......@@ -24,19 +24,24 @@ namespace operators {
template <>
bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
const Tensor *input = param->InputX();
auto input = const_cast<Tensor *>(param->InputX());
auto input_ptr = input->data<float>();
auto output = param->Out();
auto output_ptr = output->mutable_data<float>();
fpga::BypassArgs args;
args.convert_type = fpga::DATA_FP16_TO_FP32;
args.layout_type = fpga::LAYOUT_NO_CONVERT;
args.image.address = (void *)(input_ptr);
args.image.height = (uint32_t)input->dims()[0];
args.image.width = (uint32_t)input->dims()[1];
args.image.channels = 1;
args.output.address = output_ptr;
auto float_input = new Tensor;
float_input->mutable_data<float>(input->dims());
fpga::format_fp32_ofm(float_input);
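// float_input holds an FP32 copy of the FP16 feature map; the bypass args below convert into it so softmax can run on FP32 data.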
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_layout_type = fpga::LAYOUT_HWC;
args.output_layout_type = fpga::LAYOUT_CHW;
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP32;
args.image.address = input_ptr;
args.image.height = 1;
args.image.width = 1;
args.image.channels = (uint32_t)input->dims()[1];
args.output.address = float_input->data<float>();
args.output.scale_address = float_input->scale;
param->SetFloatInput(float_input);
param->SetFpgaArgs(args);
return true;
......
......@@ -18,6 +18,7 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
void SigmoidOp<DeviceType, T>::InferShape() const {
this->param_.Out()->Resize(this->param_.InputX()->dims());
......
......@@ -17,13 +17,13 @@ limitations under the License. */
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/sigmoid_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class SigmoidOp : public framework::OperatorWithKernel<
DeviceType, SigmoidParam<DeviceType>,
......@@ -43,6 +43,7 @@ class SigmoidOp : public framework::OperatorWithKernel<
void InferShape() const override;
};
} // namespace operators
} // namespace paddle_mobile
......
set(dir ${CMAKE_CURRENT_SOURCE_DIR})
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${dir}/build")
set(FOUND_MATCH OFF)
if ("googlenet" IN_LIST NET)
set(CON -1)
message(STATUS "nets :${NET}")
list(FIND NET "googlenet" CON)
if (CON GREATER -1)
# gen test
ADD_EXECUTABLE(test-googlenet net/test_googlenet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-googlenet paddle-mobile)
elseif ("mobilenet" IN_LIST NET)
set(FOUND_MATCH ON)
endif ()
list(FIND NET "mobilenet" CON)
if (CON GREATER -1)
# gen test
ADD_EXECUTABLE(test-mobilenet net/test_mobilenet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-mobilenet paddle-mobile)
# gen test
ADD_EXECUTABLE(test-mobilenet-combine net/test_mobilenet_combine.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-mobilenet-combine paddle-mobile)
set(FOUND_MATCH ON)
endif ()
elseif ("yolo" IN_LIST NET)
list(FIND NET "yolo" CON)
if (CON GREATER -1)
# gen test
ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-yolo paddle-mobile)
elseif ("squeezenet" IN_LIST NET)
# gen test
ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h test_include.h executor_for_test.h)
ADD_EXECUTABLE(test_yolo_combined net/test_yolo_combined.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test_yolo_combined paddle-mobile)
set(FOUND_MATCH ON)
endif ()
list(FIND NET "squeezenet" CON)
if (CON GREATER -1)
# gen test
ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-squeezenet paddle-mobile)
elseif("resnet" IN_LIST NET)
set(FOUND_MATCH ON)
endif ()
list(FIND NET "resnet" CON)
if (CON GREATER -1)
# gen test
ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-resnet paddle-mobile)
elseif("FPGAnets" IN_LIST NET)
ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h)
set(FOUND_MATCH ON)
endif ()
list(FIND NET "FPGAnets" CON)
if (CON GREATER -1)
ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-resnet paddle-mobile)
ADD_EXECUTABLE(test-tensor-quant fpga/test_tensor_quant.cpp test_helper.h test_include.h executor_for_test.h)
ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-resnet50 paddle-mobile)
ADD_EXECUTABLE(test-fpga-EW fpga/test_fpga_EW.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-fpga-EW paddle-mobile)
ADD_EXECUTABLE(test-fpga-conv fpga/test_fpga_conv.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-fpga-conv paddle-mobile)
ADD_EXECUTABLE(test-fpga-pooling fpga/test_fpga_pooling.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-fpga-pooling paddle-mobile)
ADD_EXECUTABLE(test-fpga-bypass fpga/test_fpga_bypass.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-fpga-bypass paddle-mobile)
ADD_EXECUTABLE(test-fpga-softmax fpga/test_fpga_softmax.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-fpga-softmax paddle-mobile)
ADD_EXECUTABLE(test-fpga-concat fpga/test_fpga_concat.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-fpga-concat paddle-mobile)
ADD_EXECUTABLE(test-tensor-quant fpga/test_tensor_quant.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-tensor-quant paddle-mobile)
ADD_EXECUTABLE(test-fpga-concat-op fpga/test_concat_op.cpp test_helper.h test_include.h)
target_link_libraries(test-fpga-concat-op paddle-mobile)
ADD_EXECUTABLE(test-format-data fpga/test_format_data.cpp test_helper.h test_include.h)
target_link_libraries(test-format-data paddle-mobile)
elseif("mobilenetssd" IN_LIST NET)
set(FOUND_MATCH ON)
endif ()
list(FIND NET "mobilenetssd" CON)
if (CON GREATER -1)
# gen test
ADD_EXECUTABLE(test-mobilenetssd net/test_mobilenet+ssd.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-mobilenetssd paddle-mobile)
elseif("nlp" IN_LIST NET)
set(FOUND_MATCH ON)
endif ()
list(FIND NET "nlp" CON)
if (CON GREATER -1)
# gen test
ADD_EXECUTABLE(test-nlp net/test_nlp.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-nlp paddle-mobile)
# gen test
ADD_EXECUTABLE(test-gru-op operators/test_gru_op.cpp test_helper.h test_include.h)
target_link_libraries(test-gru-op paddle-mobile)
else ()
set(FOUND_MATCH ON)
endif ()
list(FIND NET "mobilenetfssd" CON)
if (CON GREATER -1)
# gen test
ADD_EXECUTABLE(test-fssd net/test_mobilenet_025_fssd.cpp test_helper.h test_include.h)
target_link_libraries(test-fssd paddle-mobile)
set(FOUND_MATCH ON)
endif ()
list(FIND NET "genet" CON)
if (CON GREATER -1)
# gen test
ADD_EXECUTABLE(test-genet net/test_genet_combine.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-genet paddle-mobile)
set(FOUND_MATCH ON)
endif ()
if (NOT FOUND_MATCH)
# gen test
ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-resnet paddle-mobile)
# gen test
ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-squeezenet paddle-mobile)
# gen test
ADD_EXECUTABLE(test-googlenet net/test_googlenet.cpp test_helper.h test_include.h executor_for_test.h)
ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-yolo paddle-mobile)
# gen test
ADD_EXECUTABLE(test_yolo_combined net/test_yolo_combined.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test_yolo_combined paddle-mobile)
# gen test
ADD_EXECUTABLE(test-googlenet net/test_googlenet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-googlenet paddle-mobile)
endif()
# gen test
ADD_EXECUTABLE(test-conv-op operators/test_cov_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-conv-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-mul-op operators/test_mul_op.cpp test_helper.h test_include.h)
target_link_libraries(test-mul-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-elementwiseadd-op operators/test_elementwise_add_op.cpp test_helper.h test_include.h)
target_link_libraries(test-elementwiseadd-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-concat-op operators/test_concat_op.cpp test_helper.h test_include.h)
target_link_libraries(test-concat-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-lrn-op operators/test_lrn_op.cpp test_helper.h test_include.h)
target_link_libraries(test-lrn-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-batchnorm-op operators/test_batchnorm_op.cpp test_helper.h test_include.h)
target_link_libraries(test-batchnorm-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-priorbox-op operators/test_prior_box_op.cpp test_helper.h test_include.h)
target_link_libraries(test-priorbox-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-boxcoder-op operators/test_box_coder_op.cpp test_helper.h test_include.h)
target_link_libraries(test-boxcoder-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-transpose-op operators/test_transpose_op.cpp test_helper.h test_include.h)
target_link_libraries(test-transpose-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-multiclassnms-op operators/test_multiclass_nms_op.cpp test_helper.h test_include.h)
target_link_libraries(test-multiclassnms-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-reshape-op operators/test_reshape_op.cpp test_helper.h test_include.h)
target_link_libraries(test-reshape-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-relu-op operators/test_relu_op.cpp test_helper.h test_include.h)
target_link_libraries(test-relu-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-fc-op operators/test_fusion_fc_op.cpp test_helper.h test_include.h)
target_link_libraries(test-fc-op paddle-mobile)
# gen test log
ADD_EXECUTABLE(test-log common/test_log.cpp)
target_link_libraries(test-log paddle-mobile)
# gen test log
ADD_EXECUTABLE(test-load framework/test_load.cpp)
target_link_libraries(test-load paddle-mobile)
# gen test log
ADD_EXECUTABLE(test-loadmemory framework/test_load_memory.cpp)
target_link_libraries(test-loadmemory paddle-mobile)
ADD_EXECUTABLE(test-inference-api framework/test_inference_api.cpp)
target_link_libraries(test-inference-api paddle-mobile)
# gen test log
# gen test
ADD_EXECUTABLE(test-optimize framework/test_optimize.cpp)
target_link_libraries(test-optimize paddle-mobile)
#gen test
ADD_EXECUTABLE(test-pool operators/test_pool_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-pool paddle-mobile)
#gen test
ADD_EXECUTABLE(test-softmax operators/test_softmax_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-softmax paddle-mobile)
# gen test
ADD_EXECUTABLE(test-gemm-accuracy common/test_gemm_accuracy.cpp)
target_link_libraries(test-gemm-accuracy paddle-mobile)
# gen test
ADD_EXECUTABLE(test-gemm-perf common/test_gemm_perf.cpp)
target_link_libraries(test-gemm-perf paddle-mobile)
# gen test
ADD_EXECUTABLE(test-enforce common/test_enforce.cpp)
target_link_libraries(test-enforce paddle-mobile)
# gen test - test if openmp works
ADD_EXECUTABLE(test-openmp common/test_openmp.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-openmp paddle-mobile)
# gen test
ADD_EXECUTABLE(test-mobilenetssd net/test_mobilenet+ssd.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-mobilenetssd paddle-mobile)
# gen test
ADD_EXECUTABLE(test-mobilenet-combine net/test_mobilenet_combine.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-mobilenet-combine paddle-mobile)
# gen test
ADD_EXECUTABLE(test-genet net/test_genet_combine.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-genet paddle-mobile)
# gen test
ADD_EXECUTABLE(test-sigmoid operators/test_sigmoid_op.cpp test_include.h)
target_link_libraries(test-sigmoid paddle-mobile)
# gen test
ADD_EXECUTABLE(test-depthwise-conv-op operators/test_depthwise_conv_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-depthwise-conv-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-mobilenet net/test_mobilenet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-mobilenet paddle-mobile)
# gen test
ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-conv-add-relu-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-conv-add-bn-relu-op operators/test_fusion_conv_add_bn_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-conv-add-bn-relu-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-nlp net/test_nlp.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-nlp paddle-mobile)
# gen test
ADD_EXECUTABLE(test-gru-op operators/test_gru_op.cpp test_helper.h test_include.h)
target_link_libraries(test-gru-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-inceptionv4 net/test_inceptionv4.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-inceptionv4 paddle-mobile)
# gen test
ADD_EXECUTABLE(test-alexnet net/test_alexnet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-alexnet paddle-mobile)
ADD_EXECUTABLE(test-googlenetv1 net/test_googlenetv1_combine.cpp test_helper.h test_include.h)
target_link_libraries(test-googlenetv1 paddle-mobile)
# gen test
ADD_EXECUTABLE(test-fssd net/test_mobilenet_025_fssd.cpp test_helper.h test_include.h)
target_link_libraries(test-fssd paddle-mobile)
#add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
endif ()
......@@ -22,7 +22,7 @@ namespace fpga = paddle_mobile::fpga;
using std::cout;
using std::endl;
int main() {
void test_format_image() {
std::vector<int> dims{1, 1, 3, 3};
std::vector<float> elements{1, 2, 3, 4, 5, 6, 7, 8, 9};
frame::DDim ddim = frame::make_ddim(dims);
......@@ -44,6 +44,50 @@ int main() {
cout << endl;
auto dd = image.dims();
cout << dims[0] << dims[1] << dims[2] << dims[3] << endl;
}
void test_fill_conv_arg() {
Tensor input, out, filter;
DLOG << "Setup input";
SetupTensor<int16_t>(&input, {1, 250, 32, 30}, static_cast<int16_t>(0),
static_cast<int16_t>(1));
DLOG << "Setup filter";
SetupTensor<float>(&filter, {1001, 250, 3, 3}, static_cast<float>(0),
static_cast<float>(1));
DLOG << "Setup output";
SetupTensor<int16_t>(&out, {1, 1001, 32, 30}, static_cast<int16_t>(0),
static_cast<int16_t>(1));
auto bs_ptr = (float *)fpga::fpga_malloc(2 * 1001 * sizeof(float));
DLOG << "find max";
float max_value = fpga::filter_find_max(&filter);
DLOG << "format filter";
fpga::format_filter(&filter, max_value, 1);
DLOG << "format bs_ptr";
int element_num_per_div = fpga::get_filter_num_per_div(&filter, 1);
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, 1001);
DLOG << "format ofm";
fpga::format_fp16_ofm(&out);
DLOG << "Build arg";
fpga::WrapperConvArgs arg;
fpga::fill_conv_arg(&arg, &input, &out, &filter, true, 1, 1, 1, 1, 1, bs_ptr);
DLOG << "splitNum: " << arg.split_num << " group_num:" << arg.group_num
<< " filter_num:" << arg.filter_num;
for (int i = 0; i < arg.split_num; i++) {
DLOG << arg.conv_args[i].filter_num << " " << arg.conv_args[i].sb_address
<< " " << arg.conv_args[i].filter_address << " "
<< arg.conv_args[i].filter_scale_address;
}
}
int main() {
test_format_image();
test_fill_conv_arg();
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_include.h"
static const char *g_resnet_combine = "../models/resnet50";
int main() {
DLOG << paddle_mobile::fpga::open_device();
paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
if (paddle_mobile.Load(std::string(g_resnet_combine) + "/model",
std::string(g_resnet_combine) + "/params", true)) {
std::vector<int64_t> dims{1, 3, 224, 224};
Tensor input_tensor;
SetupTensor<float>(&input_tensor, {1, 3, 224, 224}, static_cast<float>(0),
static_cast<float>(1));
std::vector<float> input(input_tensor.data<float>(),
input_tensor.data<float>() + input_tensor.numel());
paddle_mobile.FeedData(input_tensor);
paddle_mobile.Predict_To(-1);
// paddle_mobile.Predict_From(73);
// paddle_mobile.Predict_From_To(72, 73);
DLOG << "Computation done";
return 0;
}
}
......@@ -21,6 +21,7 @@ int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
// ../../../test/models/googlenet
// ../../../test/models/mobilenet
// auto program = loader.Load(g_googlenet, true);
// auto program = loader.Load(g_mobilenet_ssd, true);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);
auto time1 = time();
// auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
// std::string(g_mobilenet_detect) + "/params", true);
auto isok = paddle_mobile.Load(g_alexnet, true);
if (isok) {
auto time2 = time();
std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
std::vector<float> input;
std::vector<int64_t> dims{1, 3, 224, 224};
GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);
auto vec_result = paddle_mobile.Predict(input, dims);
std::vector<float>::iterator biggest =
std::max_element(std::begin(vec_result), std::end(vec_result));
std::cout << " Max element is " << *biggest << " at position "
<< std::distance(std::begin(vec_result), biggest) << std::endl;
// warm up ten times
for (int i = 0; i < 10; ++i) {
auto vec_result = paddle_mobile.Predict(input, dims);
}
auto time3 = time();
for (int i = 0; i < 10; ++i) {
auto vec_result = paddle_mobile.Predict(input, dims);
}
DLOG << vec_result;
auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms"
<< std::endl;
}
std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana "
"是否存在?"
<< std::endl;
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);
auto time1 = time();
if (paddle_mobile.Load(std::string(g_genet_combine) + "/model",
std::string(g_genet_combine) + "/params", true)) {
auto time2 = time();
std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
std::vector<float> input;
std::vector<int64_t> dims{1, 3, 128, 128};
GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);
// warm up once
auto vec_result = paddle_mobile.Predict(input, dims);
std::vector<float>::iterator biggest =
std::max_element(std::begin(vec_result), std::end(vec_result));
std::cout << " Max element is " << *biggest << " at position "
<< std::distance(std::begin(vec_result), biggest) << std::endl;
auto time3 = time();
for (int i = 0; i < 10; ++i) {
auto vec_result = paddle_mobile.Predict(input, dims);
}
auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms"
<< std::endl;
}
std::cout
<< "If the result is NaN, check whether test/images/test_image_1x3x224x224_float exists."
<< std::endl;
return 0;
}
......@@ -17,7 +17,14 @@ limitations under the License. */
#include "../test_include.h"
int main() {
#ifdef PADDLE_MOBILE_FPGA
paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
#endif
#ifdef PADDLE_MOBILE_CPU
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
#endif
paddle_mobile.SetThreadNum(4);
bool optimize = true;
auto time1 = time();
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);
auto time1 = time();
if (paddle_mobile.Load(std::string(g_googlenetv1_combined) + "/model",
std::string(g_googlenetv1_combined) + "/params",
false)) {
auto time2 = time();
std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
std::vector<float> input;
std::vector<int64_t> dims{1, 3, 160, 160};
GetInput<float>(g_img, &input, dims);
for (int i = 0; i < input.size(); i += 1000) {
std::cout << input[i] << std::endl;
}
// auto vec_result = paddle_mobile.Predict(input, dims);
// std::vector<float>::iterator biggest =
// std::max_element(std::begin(vec_result), std::end(vec_result));
// std::cout << " Max element is " << *biggest << " at position "
// << std::distance(std::begin(vec_result), biggest) <<
// std::endl;
// // warm up ten times
// for (int i = 0; i < 1; ++i) {
// auto vec_result = paddle_mobile.Predict(input, dims);
// }
auto time3 = time();
auto vec_result = paddle_mobile.Predict(input, dims);
for (int j = 0; j < vec_result.size(); ++j) {
std::cout << j << " : " << vec_result[j] << std::endl;
}
auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) / 1 << "ms"
<< std::endl;
}
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);
auto time1 = time();
// auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
// std::string(g_mobilenet_detect) + "/params", true);
auto isok = paddle_mobile.Load(g_inceptionv4, true);
if (isok) {
auto time2 = time();
std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
std::vector<float> input;
std::vector<int64_t> dims{1, 3, 224, 224};
GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);
auto vec_result = paddle_mobile.Predict(input, dims);
std::vector<float>::iterator biggest =
std::max_element(std::begin(vec_result), std::end(vec_result));
std::cout << " Max element is " << *biggest << " at position "
<< std::distance(std::begin(vec_result), biggest) << std::endl;
// warm up ten times
for (int i = 0; i < 10; ++i) {
auto vec_result = paddle_mobile.Predict(input, dims);
}
auto time3 = time();
for (int i = 0; i < 10; ++i) {
auto vec_result = paddle_mobile.Predict(input, dims);
}
// DLOG << vec_result;
auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms"
<< std::endl;
}
std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana "
"是否存在?"
<< std::endl;
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);
auto time1 = time();
auto isok = paddle_mobile.Load(
std::string(g_mobilenet_ssd_gesture) + "/model",
std::string(g_mobilenet_ssd_gesture) + "/params", true);
// auto isok = paddle_mobile.Load(g_mobilenet_ssd, false);
if (isok) {
auto time2 = time();
std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
std::vector<float> input;
std::vector<int64_t> dims{1, 3, 300, 300};
GetInput<float>(g_hand, &input, dims);
// warm up ten times
for (int i = 0; i < 10; ++i) {
auto output = paddle_mobile.Predict(input, dims);
}
auto time3 = time();
for (int i = 0; i < 10; ++i) {
auto output = paddle_mobile.Predict(input, dims);
}
auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms"
<< std::endl;
}
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);
auto time1 = time();
// auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
// std::string(g_mobilenet_detect) + "/params", true);
auto isok = paddle_mobile.Load(g_mobilenet, true);
if (isok) {
auto time2 = time();
std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
std::vector<float> input;
std::vector<int64_t> dims{1, 3, 224, 224};
GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);
auto vec_result = paddle_mobile.Predict(input, dims);
std::vector<float>::iterator biggest =
std::max_element(std::begin(vec_result), std::end(vec_result));
std::cout << " Max element is " << *biggest << " at position "
<< std::distance(std::begin(vec_result), biggest) << std::endl;
// warm up ten times
for (int i = 0; i < 10; ++i) {
auto vec_result = paddle_mobile.Predict(input, dims);
}
auto time3 = time();
for (int i = 0; i < 10; ++i) {
auto vec_result = paddle_mobile.Predict(input, dims);
}
DLOG << vec_result;
auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms"
<< std::endl;
}
std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana "
"是否存在?"
<< std::endl;
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
int main(int argc, char **argv) {
int times = 10;
if (argc <= 1) {
times = 10;
std::cout << "没有输入 , 使用默认10次 " << times << std::endl;
} else {
std::string arstr = argv[1];
times = std::stoi(arstr);
std::cout << "input times: " << times << std::endl;
}
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(1);
auto isok =
paddle_mobile.Load(std::string(g_fluid_fssd_new) + "/model",
std::string(g_fluid_fssd_new) + "/params", true);
if (isok) {
std::vector<float> input;
std::vector<int64_t> dims{1, 3, 160, 160};
GetInput<float>(g_imgfssd_ar1, &input, dims);
std::cout << "预热10次....." << std::endl;
// 预热十次
for (int i = 0; i < 10; ++i) {
auto output = paddle_mobile.Predict(input, dims);
}
std::cout << "开始....." << std::endl;
double time_sum = 0;
for (int i = 0; i < times; ++i) {
auto time3 = time();
auto output = paddle_mobile.Predict(input, dims);
auto time4 = time();
double timeDiff = time_diff(time3, time4);
time_sum += timeDiff;
std::cout << "第" << i << "次"
<< "predict cost :" << timeDiff << "ms" << std::endl;
}
std::cout << "平均时间:" << time_sum / times << "ms" << std::endl;
}
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);
auto time1 = time();
if (paddle_mobile.Load(std::string(g_mobilenet_combined) + "/model",
std::string(g_mobilenet_combined) + "/params", true)) {
auto time2 = time();
std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
std::vector<float> input;
std::vector<int64_t> dims{1, 3, 224, 224};
GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);
auto vec_result = paddle_mobile.Predict(input, dims);
std::vector<float>::iterator biggest =
std::max_element(std::begin(vec_result), std::end(vec_result));
std::cout << " Max element is " << *biggest << " at position "
<< std::distance(std::begin(vec_result), biggest) << std::endl;
// warm up ten times
for (int i = 0; i < 10; ++i) {
auto vec_result = paddle_mobile.Predict(input, dims);
}
auto time3 = time();
for (int i = 0; i < 10; ++i) {
auto vec_result = paddle_mobile.Predict(input, dims);
}
auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms"
<< std::endl;
}
std::cout
<< "If the result is NaN, check whether test/images/test_image_1x3x224x224_float exists."
<< std::endl;
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);
auto time1 = time();
// auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
// std::string(g_mobilenet_detect) + "/params", true);
auto isok = paddle_mobile.Load(g_nlp, true, false, true);
// auto isok = paddle_mobile.Load(std::string(g_nlp) + "/model",
// std::string(g_nlp) + "/params", false);
if (isok) {
auto time2 = time();
std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
// 1064 1603 644 699 2878 1219 867 1352 8 1 13 312 479
std::vector<int64_t> ids{1918, 117, 55, 97, 1352, 4272, 1656, 903};
paddle_mobile::framework::LoDTensor words;
auto size = static_cast<int>(ids.size());
paddle_mobile::framework::LoD lod{{0, ids.size()}};
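// single-level LoD with offsets {0, ids.size()}: the whole id list forms one sequence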
DDim dims{size, 1};
words.Resize(dims);
words.set_lod(lod);
DLOG << "words lod : " << words.lod();
auto *pdata = words.mutable_data<int64_t>();
size_t n = words.numel() * sizeof(int64_t);
DLOG << "n :" << n;
memcpy(pdata, ids.data(), n);
DLOG << "words lod 22: " << words.lod();
auto time3 = time();
for (int i = 0; i < 1; ++i) {
auto vec_result = paddle_mobile.PredictLod(words);
DLOG << *vec_result;
}
auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) / 1 << "ms"
<< std::endl;
}
auto time2 = time();
std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
// 1064 1603 644 699 2878 1219 867 1352 8 1 13 312 479
std::vector<int64_t> ids{1791, 656, 1549, 281, 96};
paddle_mobile::framework::LoDTensor words;
auto size = static_cast<int>(ids.size());
paddle_mobile::framework::LoD lod{{0, ids.size()}};
DDim dims{size, 1};
words.Resize(dims);
words.set_lod(lod);
DLOG << "words lod : " << words.lod();
auto *pdata = words.mutable_data<int64_t>();
size_t n = words.numel() * sizeof(int64_t);
DLOG << "n :" << n;
memcpy(pdata, ids.data(), n);
DLOG << "words lod 22: " << words.lod();
auto time3 = time();
for (int i = 0; i < 1; ++i) {
auto vec_result = paddle_mobile.PredictLod(words);
DLOG << *vec_result;
}
auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) / 1 << "ms"
<< std::endl;
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
int main() {
#ifdef PADDLE_MOBILE_FPGA
paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
#endif
#ifdef PADDLE_MOBILE_CPU
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
#endif
paddle_mobile.SetThreadNum(4);
auto time1 = time();
if (paddle_mobile.Load(g_resnet, true)) {
auto time2 = time();
std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
std::vector<int64_t> dims{1, 3, 32, 32};
Tensor input_tensor;
SetupTensor<float>(&input_tensor, {1, 3, 32, 32}, static_cast<float>(0),
static_cast<float>(1));
std::vector<float> input(input_tensor.data<float>(),
input_tensor.data<float>() + input_tensor.numel());
#ifndef PADDLE_MOBILE_FPGA
// warm up ten times
for (int i = 0; i < 10; ++i) {
paddle_mobile.Predict(input, dims);
}
auto time3 = time();
for (int i = 0; i < 10; ++i) {
paddle_mobile.Predict(input, dims);
}
auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
<< std::endl;
#else
auto time3 = time();
paddle_mobile.FeedData(input_tensor);
paddle_mobile.Predict_To(10);
paddle_mobile.Predict_From(10);
auto tensor_ptr = paddle_mobile.FetchResult(9);
std::cout << "Tensor element number for op[9]: " << tensor_ptr->numel()
<< std::endl;
auto result_ptr = paddle_mobile.FetchResult();
std::cout << "Result tensor element number: " << result_ptr->numel()
<< std::endl;
auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
<< std::endl;
#endif
}
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);
// ../../../test/models/googlenet
// ../../../test/models/mobilenet
auto time1 = time();
if (paddle_mobile.Load(g_squeezenet, true)) {
auto time2 = time();
std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
std::vector<int64_t> dims{1, 3, 227, 227};
Tensor input_tensor;
SetupTensor<float>(&input_tensor, {1, 3, 227, 227}, static_cast<float>(0),
static_cast<float>(1));
std::vector<float> input(input_tensor.data<float>(),
input_tensor.data<float>() + input_tensor.numel());
// warm up ten times
for (int i = 0; i < 10; ++i) {
paddle_mobile.Predict(input, dims);
}
auto time3 = time();
for (int i = 0; i < 10; ++i) {
paddle_mobile.Predict(input, dims);
}
auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms"
<< std::endl;
}
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);
// ../../../test/models/googlenet
// ../../../test/models/mobilenet
auto time1 = time();
if (paddle_mobile.Load(g_yolo, true)) {
auto time2 = time();
std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
std::vector<int64_t> dims{1, 3, 227, 227};
Tensor input_tensor;
SetupTensor<float>(&input_tensor, {1, 3, 227, 227}, static_cast<float>(0),
static_cast<float>(1));
std::vector<float> input(input_tensor.data<float>(),
input_tensor.data<float>() + input_tensor.numel());
// warm up ten times
for (int i = 0; i < 10; ++i) {
paddle_mobile.Predict(input, dims);
}
auto time3 = time();
for (int i = 0; i < 10; ++i) {
paddle_mobile.Predict(input, dims);
}
auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms"
<< std::endl;
}
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);
// ../../../test/models/googlenet
// ../../../test/models/mobilenet
auto time1 = time();
if (paddle_mobile.Load(std::string(g_yolo_combined) + "/model",
std::string(g_yolo_combined) + "/params", true)) {
auto time2 = time();
std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
std::vector<int64_t> dims{1, 3, 416, 416};
std::vector<float> input;
GetInput<float>(g_test_image_desktop_1_3_416_416_nchw_float, &input, dims);
std::cout << "input.size(): " << input.size() << std::endl;
for (int j = 0; j < 100; ++j) {
std::cout << j << " : " << input[j] << std::endl;
}
// // warm up ten times
// for (int i = 0; i < 10; ++i) {
// paddle_mobile.Predict(input, dims);
// }
auto time3 = time();
const vector<float> vector_out = paddle_mobile.Predict(input, dims);
std::cout << "--------------------------------------------" << std::endl;
for (float i : vector_out) {
std::cout << i << std::endl;
}
std::cout << "--------------------------------------------" << std::endl;
std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
auto time4 = time();
std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms"
<< std::endl;
}
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "../test_helper.h"
#include "../test_include.h"
#include "operators/batchnorm_op.h"
namespace paddle_mobile {
namespace framework {
template <typename Dtype>
class TestBatchNormOp {
public:
explicit TestBatchNormOp(const Program<Dtype> p) : program_(p) {
if (use_optimize_) {
to_predict_program_ = program_.optimizeProgram;
} else {
to_predict_program_ = program_.originProgram;
}
const std::vector<std::shared_ptr<BlockDesc>> blocks =
to_predict_program_->Blocks();
// DLOG << " **block size " << blocks.size();
for (int i = 0; i < blocks.size(); ++i) {
std::shared_ptr<BlockDesc> block_desc = blocks[i];
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
// DLOG << " ops " << ops.size();
for (int j = 0; j < ops.size(); ++j) {
std::shared_ptr<OpDesc> op = ops[j];
if (op->Type() == "batch_norm" &&
op->Input("X")[0] == "conv2d_5.tmp_0") {
DLOG << " mul attr size: " << op->GetAttrMap().size();
DLOG << " inputs size: " << op->GetInputs().size();
DLOG << " outputs size: " << op->GetOutputs().size();
DLOG << " Input X is : " << op->Input("X")[0];
DLOG << " Input Mean is : " << op->Input("Mean")[0];
DLOG << " Input Variance is : " << op->Input("Variance")[0];
DLOG << " Input Scale is : " << op->Input("Scale")[0];
DLOG << " Input Bias is : " << op->Input("Bias")[0];
DLOG << " Output Y is : " << op->Output("Y")[0];
DLOG << " epsilon : " << op->GetAttrMap().at("epsilon").Get<float>();
std::shared_ptr<operators::BatchNormOp<Dtype, float>> lrn =
std::make_shared<operators::BatchNormOp<Dtype, float>>(
op->Type(), op->GetInputs(), op->GetOutputs(),
op->GetAttrMap(), program_.scope);
ops_of_block_[*block_desc.get()].push_back(lrn);
}
}
}
}
std::shared_ptr<Tensor> predict_bn(const Tensor &t1, const Tensor &t2,
const Tensor &t3, const Tensor &t4,
const Tensor &t5) {
// feed
auto scope = program_.scope;
Variable *x1_feed_value = scope->Var("conv2d_5.tmp_0");
auto tensor_x1 = x1_feed_value->GetMutable<LoDTensor>();
tensor_x1->ShareDataWith(t1);
Variable *mean_feed_value = scope->Var("batch_norm_10.w_1");
auto tensor_mean = mean_feed_value->GetMutable<LoDTensor>();
tensor_mean->ShareDataWith(t2);
Variable *scale_feed_value = scope->Var("batch_norm_10.w_0");
auto tensor_scale = scale_feed_value->GetMutable<LoDTensor>();
tensor_scale->ShareDataWith(t3);
Variable *variance_feed_value = scope->Var("batch_norm_10.w_2");
auto tensor_variance = variance_feed_value->GetMutable<LoDTensor>();
tensor_variance->ShareDataWith(t4);
Variable *bias_feed_value = scope->Var("batch_norm_10.b_0");
auto tensor_bias = bias_feed_value->GetMutable<LoDTensor>();
tensor_bias->ShareDataWith(t5);
Variable *output = scope->Var("batch_norm_10.tmp_2");
auto *output_tensor = output->GetMutable<LoDTensor>();
output_tensor->mutable_data<float>({1, 256, 38, 38});
// DLOG << typeid(output_tensor).name();
// DLOG << "output_tensor dims: " << output_tensor->dims();
std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
out_tensor.reset(output_tensor);
predict_bn(t1, t2, t3, t4, t5, 0);
return out_tensor;
}
private:
const framework::Program<Dtype> program_;
std::shared_ptr<ProgramDesc> to_predict_program_;
std::map<framework::BlockDesc,
std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
ops_of_block_;
bool use_optimize_ = false;
void predict_bn(const Tensor &t1, const Tensor &t2, const Tensor &t3,
const Tensor &t4, const Tensor &t5, int block_id) {
std::shared_ptr<BlockDesc> to_predict_block =
to_predict_program_->Block(block_id);
for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
auto op = ops_of_block_[*to_predict_block.get()][j];
DLOG << "op -> run()";
op->Run();
}
}
};
template class TestBatchNormOp<CPU>;
} // namespace framework
} // namespace paddle_mobile
int main() {
DLOG << "----------**********----------";
DLOG << "begin to run BatchNormOp Test";
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(std::string(g_mobilenet_ssd));
/// input x (1,256,38,38)
paddle_mobile::framework::Tensor inputx1;
SetupTensor<float>(&inputx1, {1, 256, 38, 38}, static_cast<float>(0),
static_cast<float>(1));
auto *inputx1_ptr = inputx1.data<float>();
paddle_mobile::framework::Tensor mean;
SetupTensor<float>(&mean, {256}, static_cast<float>(0),
static_cast<float>(1));
auto *mean_ptr = mean.data<float>();
paddle_mobile::framework::Tensor scale;
SetupTensor<float>(&scale, {256}, static_cast<float>(0),
static_cast<float>(1));
auto *scale_ptr = scale.data<float>();
paddle_mobile::framework::Tensor variance;
SetupTensor<float>(&variance, {256}, static_cast<float>(0),
static_cast<float>(1));
auto *variance_ptr = variance.data<float>();
paddle_mobile::framework::Tensor bias;
SetupTensor<float>(&bias, {256}, static_cast<float>(0),
static_cast<float>(1));
auto *bias_ptr = bias.data<float>();
paddle_mobile::framework::TestBatchNormOp<paddle_mobile::CPU> testBatchNormOp(
program);
auto output_bn =
testBatchNormOp.predict_bn(inputx1, mean, scale, variance, bias);
auto *output_bn_ptr = output_bn->data<float>();
DLOG << " (" << inputx1_ptr[0] << " - " << mean_ptr[0] << ")/(("
<< variance_ptr[0] << " + 0.00001"
<< ")^0.5)* " << scale_ptr[0] << " + " << bias_ptr[0] << " = ";
DLOG << output_bn_ptr[0];
DLOG << "input_ptr 0 : " << inputx1_ptr[0];
DLOG << "output_ptr 0 : " << output_bn_ptr[0];
return 0;
}
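The expected value logged at the end follows the standard batch-norm formula y = (x - mean) / sqrt(variance + epsilon) * scale + bias. A minimal reference check for a single element, assuming epsilon = 1e-5 to match the "+ 0.00001" in the log (the helper name is chosen here for illustration):

#include <cmath>

// Reference batch normalization for one element.
inline float batch_norm_ref(float x, float mean, float variance, float scale,
                            float bias, float epsilon = 1e-5f) {
  return (x - mean) / std::sqrt(variance + epsilon) * scale + bias;
}

// Usage sketch: compare against the operator output for element 0.
// float expected = batch_norm_ref(inputx1_ptr[0], mean_ptr[0],
//                                 variance_ptr[0], scale_ptr[0], bias_ptr[0]);
// DLOG << expected << " vs " << output_bn_ptr[0];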
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "../test_include.h"
#include "operators/box_coder_op.h"
namespace paddle_mobile {
namespace framework {
template <typename Dtype>
class TestBoxCoderOp {
public:
explicit TestBoxCoderOp(const Program<Dtype> p) : program_(p) {
if (use_optimize_) {
to_predict_program_ = program_.optimizeProgram;
} else {
to_predict_program_ = program_.originProgram;
}
const std::vector<std::shared_ptr<BlockDesc>> blocks =
to_predict_program_->Blocks();
// DLOG << " **block size " << blocks.size();
for (auto block_desc : blocks) {
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
// DLOG << " ops " << ops.size();
for (auto op : ops) {
if (op->Type() == "box_coder" &&
op->Input("PriorBox")[0] == "concat_0.tmp_0") {
DLOG << " mul attr size: " << op->GetAttrMap().size();
DLOG << " inputs size: " << op->GetInputs().size();
DLOG << " outputs size: " << op->GetOutputs().size();
DLOG << " Input PriorBox is : " << op->Input("PriorBox")[0];
DLOG << " Input PriorBoxVar is : " << op->Input("PriorBoxVar")[0];
DLOG << " Input TargetBox is : " << op->Input("TargetBox")[0];
DLOG << " OutputBox is : " << op->Output("OutputBox")[0];
DLOG << " code_type : "
<< op->GetAttrMap().at("code_type").Get<std::string>();
std::shared_ptr<operators::BoxCoderOp<Dtype, float>> boxcoder =
std::make_shared<operators::BoxCoderOp<Dtype, float>>(
op->Type(), op->GetInputs(), op->GetOutputs(),
op->GetAttrMap(), program_.scope);
ops_of_block_[*block_desc.get()].push_back(boxcoder);
}
}
}
}
std::shared_ptr<Tensor> predict_boxcoder(const Tensor &t1, const Tensor &t2,
const Tensor &t3) {
// feed
auto scope = program_.scope;
Variable *prior_box = scope->Var("concat_0.tmp_0");
auto tensor_x1 = prior_box->GetMutable<LoDTensor>();
tensor_x1->ShareDataWith(t1);
Variable *prior_box_var = scope->Var("concat_1.tmp_0");
auto tensor_x2 = prior_box_var->GetMutable<LoDTensor>();
tensor_x2->ShareDataWith(t2);
Variable *target_box = scope->Var("concat_2.tmp_0");
auto tensor_x3 = target_box->GetMutable<LoDTensor>();
tensor_x3->ShareDataWith(t3);
Variable *boxes_output = scope->Var("box_coder_0.tmp_0");
auto *boxes_output_tensor = boxes_output->GetMutable<LoDTensor>();
boxes_output_tensor->mutable_data<float>({1, 1917, 4});
// DLOG << typeid(output_tensor).name();
// DLOG << "output_tensor dims: " << output_tensor->dims();
std::shared_ptr<Tensor> outbox_tensor = std::make_shared<LoDTensor>();
outbox_tensor.reset(boxes_output_tensor);
predict_boxcoder(t1, t2, t3, 0);
return outbox_tensor;
}
private:
const framework::Program<Dtype> program_;
std::shared_ptr<ProgramDesc> to_predict_program_;
std::map<framework::BlockDesc,
std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
ops_of_block_;
bool use_optimize_ = false;
void predict_boxcoder(const Tensor &t1, const Tensor &t2, const Tensor &t3,
int block_id) {
std::shared_ptr<BlockDesc> to_predict_block =
to_predict_program_->Block(block_id);
for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
auto op = ops_of_block_[*to_predict_block.get()][j];
DLOG << "op -> run()";
op->Run();
}
}
};
template class TestBoxCoderOp<CPU>;
} // namespace framework
} // namespace paddle_mobile
int main() {
DLOG << "----------**********----------";
DLOG << "begin to run BoxCoderOp Test";
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(std::string(g_mobilenet_ssd));
paddle_mobile::framework::Tensor priorbox;
SetupTensor<float>(&priorbox, {1917, 4}, static_cast<float>(0),
static_cast<float>(1));
auto *priorbox_ptr = priorbox.data<float>();
paddle_mobile::framework::Tensor priorboxvar;
SetupTensor<float>(&priorboxvar, {1917, 4}, static_cast<float>(0.1),
static_cast<float>(0.2));
auto *priorboxvar_ptr = priorboxvar.data<float>();
paddle_mobile::framework::Tensor targetbox;
SetupTensor<float>(&targetbox, {1, 1917, 4}, static_cast<float>(0),
static_cast<float>(1));
auto *targetbox_ptr = targetbox.data<float>();
paddle_mobile::framework::TestBoxCoderOp<paddle_mobile::CPU> testBoxCoderOp(
program);
auto output_boxcoder =
testBoxCoderOp.predict_boxcoder(priorbox, priorboxvar, targetbox);
auto output_boxcoder_ptr = output_boxcoder->data<float>();
for (int i = 0; i < output_boxcoder->numel(); i++) {
DLOG << output_boxcoder_ptr[i];
}
DLOGF("\n");
/// check the bbox at index 25 (elements 100..103).
DLOG << "PriorBox**************";
DLOG << priorbox_ptr[100];
DLOG << priorbox_ptr[101];
DLOG << priorbox_ptr[102];
DLOG << priorbox_ptr[103];
DLOG << "PriorBoxVar**************";
DLOG << priorboxvar_ptr[100];
DLOG << priorboxvar_ptr[101];
DLOG << priorboxvar_ptr[102];
DLOG << priorboxvar_ptr[103];
DLOG << "TargetBox***************";
DLOG << targetbox_ptr[100];
DLOG << targetbox_ptr[101];
DLOG << targetbox_ptr[102];
DLOG << targetbox_ptr[103];
DLOG << "OutputBox**************";
DLOG << output_boxcoder_ptr[100];
DLOG << output_boxcoder_ptr[101];
DLOG << output_boxcoder_ptr[102];
DLOG << output_boxcoder_ptr[103];
DLOG << "***********----------------------**************";
auto priorbox_w = priorbox_ptr[102] - priorbox_ptr[100];
auto priorbox_h = priorbox_ptr[103] - priorbox_ptr[101];
auto priorbox_center_x = (priorbox_ptr[100] + priorbox_ptr[102]) / 2;
auto priorbox_center_y = (priorbox_ptr[101] + priorbox_ptr[103]) / 2;
DLOG << "prior box width : " << priorbox_w;
DLOG << "prior box height : " << priorbox_h;
DLOG << "prior box center x : " << priorbox_center_x;
DLOG << "prior box center y : " << priorbox_center_y;
auto target_box_center_x =
priorboxvar_ptr[100] * targetbox_ptr[100] * priorbox_w +
priorbox_center_x;
DLOG << "target_box_center_x : " << target_box_center_x;
auto target_box_center_y =
priorboxvar_ptr[101] * targetbox_ptr[101] * priorbox_h +
priorbox_center_y;
DLOG << "target_box_center_y : " << target_box_center_y;
auto target_box_width =
std::exp(priorboxvar_ptr[102] * targetbox_ptr[102]) * priorbox_w;
DLOG << "target_box_width : " << target_box_width;
auto target_box_height =
std::exp(priorboxvar_ptr[103] * targetbox_ptr[103]) * priorbox_h;
DLOG << "target_box_height : " << target_box_height;
DLOG << "pre x min : " << target_box_center_x - target_box_width / 2;
DLOG << "pre y min : " << target_box_center_y - target_box_height / 2;
DLOG << "pre x max : " << target_box_center_x + target_box_width / 2;
DLOG << "pre y max : " << target_box_center_y + target_box_height / 2;
return 0;
}
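The manual computation at the end of this test decodes one box using the center-size code type: the encoded offsets are scaled by the prior-box variances, applied to the prior box's center and size, and converted back to corner coordinates. A self-contained sketch of that decode step (the function name is an illustration, not the operator's API):

#include <cmath>

// Decode one target box against one prior box (center-size code type),
// mirroring the manual check above. Each pointer refers to 4 floats:
// prior = {xmin, ymin, xmax, ymax}, var = per-coordinate variances,
// target = encoded offsets, out = decoded {xmin, ymin, xmax, ymax}.
inline void decode_center_size(const float *prior, const float *var,
                               const float *target, float *out) {
  float pw = prior[2] - prior[0];
  float ph = prior[3] - prior[1];
  float pcx = (prior[0] + prior[2]) / 2;
  float pcy = (prior[1] + prior[3]) / 2;
  float cx = var[0] * target[0] * pw + pcx;
  float cy = var[1] * target[1] * ph + pcy;
  float w = std::exp(var[2] * target[2]) * pw;
  float h = std::exp(var[3] * target[3]) * ph;
  out[0] = cx - w / 2;  // xmin
  out[1] = cy - h / 2;  // ymin
  out[2] = cx + w / 2;  // xmax
  out[3] = cy + h / 2;  // ymax
}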
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_include.h"
#include "operators/concat_op.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(g_googlenet);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
Executor4Test<paddle_mobile::CPU,
paddle_mobile::operators::ConcatOp<paddle_mobile::CPU, float>>
executor(program, "concat");
// 1. input_tensors;
vector<Tensor> input_tensors;
Tensor input1;
auto input1_data = CreateInput<float>(&input1, {4, 10, 2, 2}, 0, 1);
input_tensors.push_back(input1);
Tensor input2;
auto input2_data = CreateInput<float>(&input2, {4, 20, 2, 2}, 0, 1);
input_tensors.push_back(input2);
Tensor input3;
auto input3_data = CreateInput<float>(&input3, {4, 30, 2, 2}, 0, 1);
input_tensors.push_back(input3);
Tensor input4;
auto input4_data = CreateInput<float>(&input4, {4, 40, 2, 2}, 0, 1);
input_tensors.push_back(input4);
// 2. input_names
vector<string> input_names({
"conv2d_3.tmp_1",
"conv2d_5.tmp_1",
"conv2d_7.tmp_1",
"conv2d_8.tmp_1",
});
// 3. output_names
vector<string> output_names({"concat_0.tmp_0"});
// 4. out_dims;
vector<DDim> out_ddims;
auto out_ddim = paddle_mobile::framework::make_ddim({4, 100, 2, 2});
out_ddims.push_back(out_ddim);
auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
output_names, out_ddims);
auto output0_data = output[0]->data<float>();
// 5. test one example.
int input_n = 1;
int input_c = 2;
int input_h = 0;
int input_w = 1;
int stride0 = input3.numel() / input3.dims()[0];
int stride1 = input3.numel() / input3.dims()[0] / input3.dims()[1];
int stride2 = input3.dims()[3];
/// inputx1 (4,10,2,2),
/// inputx2 (4,20,2,2),
/// inputx3 (4,30,2,2),
/// inputx4 (4,40,2,2),
/// axis = 1
/// output (4,100,2,2)
int input_index =
input_n * stride0 + input_c * stride1 + input_h * stride2 + input_w;
int output_index = input_n * 100 * 2 * 2 +
(input_c + input1.dims()[1] + input2.dims()[1]) * 2 * 2 +
input_h * 2 + input_w;
DLOG << " input3 [1, 2,0,1] = " << input3_data[input_index];
DLOG << " output [1,32,0,1] = " << output0_data[output_index];
return 0;
}
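The index arithmetic above locates element (n=1, c=2, h=0, w=1) of input3 inside the axis-1 concatenation: the channel index is shifted by the channel counts of the inputs placed before input3. A small sketch of that mapping as a reusable helper (name and signature chosen here for illustration):

// Flat index of element (n, c, h, w) of one concat input inside the axis-1
// concatenated output. c_offset is the sum of the channel counts of all
// inputs that precede this one; total_c, h_dim, w_dim describe the output.
inline int concat_axis1_index(int n, int c, int h, int w, int c_offset,
                              int total_c, int h_dim, int w_dim) {
  return ((n * total_c + c_offset + c) * h_dim + h) * w_dim + w;
}

// For the test above, concat_axis1_index(1, 2, 0, 1, 10 + 20, 100, 2, 2)
// reproduces output_index == 529.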
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_include.h"
#include "operators/fusion_conv_add_relu_op.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
// ../models/image_classification_resnet.inference.model
auto program = loader.Load(g_googlenet, true);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
Executor4Test<
paddle_mobile::CPU,
paddle_mobile::operators::FusionConvAddReluOp<paddle_mobile::CPU, float>>
executor(program, "fusion_conv_add_relu", true);
paddle_mobile::framework::Tensor input;
GetInput<float>(g_test_image_1x3x224x224, &input, {1, 3, 224, 224});
// // use SetupTensor if there is no local input image.
// SetupTensor<float>(&input, {1, 3, 224, 224}, static_cast<float>(0),
// static_cast<float>(1));
auto out_ddim = paddle_mobile::framework::make_ddim({1, 64, 112, 112});
auto output = executor.Predict(input, "data", "conv2d_0.tmp_2", out_ddim);
auto output_ptr = output->data<float>();
for (int j = 0; j < 25; ++j) {
DLOG << " value of output: " << output_ptr[j];
}
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_include.h"
#include "operators/conv_op.h"
int main() {
paddle_mobile::Loader<paddle_mobile::GPU_MALI> loader;
// ../models/image_classification_resnet.inference.model
auto program = loader.Load(g_googlenet);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
Executor4Test<paddle_mobile::GPU_MALI, paddle_mobile::operators::ConvOp<
paddle_mobile::GPU_MALI, float>>
executor(program, "conv2d");
paddle_mobile::framework::Tensor input;
GetInput<float>(g_test_image_1x3x224x224, &input, {1, 3, 224, 224});
// // use SetupTensor if there is no local input image.
// SetupTensor<float>(&input, {1, 3, 224, 224}, static_cast<float>(0),
// static_cast<float>(1));
auto out_ddim = paddle_mobile::framework::make_ddim({1, 64, 112, 112});
auto output = executor.Predict(input, "data", "conv2d_0.tmp_0", out_ddim);
auto output_ptr = output->data<float>();
for (int j = 0; j < 20; ++j) {
DLOG << " value of output: " << output_ptr[j];
}
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_include.h"
#include "operators/depthwise_conv_op.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
// ../models/image_classification_resnet.inference.model
auto program = loader.Load(g_mobilenet_ssd);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
Executor4Test<paddle_mobile::CPU, paddle_mobile::operators::DepthwiseConvOp<
paddle_mobile::CPU, float>>
executor(program, "depthwise_conv2d");
paddle_mobile::framework::LoDTensor input;
// GetInput<float>(g_test_image_1x3x224x224, &input, {1, 3, 224, 224});
// use SetupTensor if there is no local input image.
SetupTensor<float>(&input, {1, 32, 150, 150}, static_cast<float>(0),
static_cast<float>(1));
auto input_ptr = input.data<float>();
auto out_ddim = paddle_mobile::framework::make_ddim({1, 32, 150, 150});
auto output = executor.Predict(input, "batch_norm_0.tmp_3",
"depthwise_conv2d_0.tmp_0", out_ddim);
auto output_ptr = output->data<float>();
for (int j = 0; j < output->numel(); ++j) {
DLOG << " value of output: " << output_ptr[j];
}
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_include.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(g_resnet);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
Executor4Test<paddle_mobile::CPU, paddle_mobile::operators::ElementwiseAddOp<
paddle_mobile::CPU, float>>
executor(program, "elementwise_add");
// 1. input_tensors;
vector<Tensor> input_tensors;
Tensor input1;
auto input1_data = CreateInput<float>(&input1, {1, 3, 224, 224}, 0, 1);
input_tensors.push_back(input1);
Tensor input2;
auto input2_data = CreateInput<float>(&input2, {224}, 0, 1);
input_tensors.push_back(input2);
// 2. input_names
vector<string> input_names({
"batch_norm_2.tmp_2",
"batch_norm_0.tmp_3",
});
// 3. output_names
vector<string> output_names({"elementwise_add_0.tmp_0"});
// 4. out_dims;
vector<DDim> out_ddims;
auto out_ddim = paddle_mobile::framework::make_ddim({1, 3, 224, 224});
out_ddims.push_back(out_ddim);
auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
output_names, out_ddims);
auto output0_data = output[0]->data<float>();
/// output (1,3,224,224)
DLOG << "output memory size : " << output[0]->memory_size();
DLOG << "output numel : " << output[0]->numel();
DLOG << input1_data[226] << " + " << input2_data[2] << " = "
<< output0_data[226];
}
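The final check relies on the broadcast rule of elementwise_add: the 224-element input2 is broadcast along the last dimension of the (1, 3, 224, 224) input1, so flat index 226 picks up input2[226 % 224] = input2[2]. A minimal reference sketch under that assumption:

#include <vector>

// Elementwise add with a 1-D operand broadcast along the last dimension,
// matching the (1,3,224,224) + (224) case exercised above.
std::vector<float> add_broadcast_last_dim(const std::vector<float> &x,
                                          const std::vector<float> &y) {
  std::vector<float> out(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    out[i] = x[i] + y[i % y.size()];
  }
  return out;
}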
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_include.h"
#include "operators/fusion_conv_add_bn_relu_op.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
// ../models/image_classification_resnet.inference.model
auto program = loader.Load(g_mobilenet, true);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
Executor4Test<paddle_mobile::CPU,
paddle_mobile::operators::FusionConvAddBNReluOp<
paddle_mobile::CPU, float>>
executor(program, "fusion_conv_add_bn_relu", true);
std::cout << "executor 4 test: " << std::endl;
paddle_mobile::framework::Tensor input;
GetInput<float>(g_test_image_1x3x224x224_banana, &input, {1, 3, 224, 224});
// // use SetupTensor if there is no local input image.
// SetupTensor<float>(&input, {1, 3, 224, 224}, static_cast<float>(0),
// static_cast<float>(1));
DLOG << " fuck: " << input;
auto out_ddim = paddle_mobile::framework::make_ddim({1, 32, 112, 112});
std::cout << "before predict: " << std::endl;
auto output =
executor.Predict(input, "data", "conv2_1_dw_bn.tmp_2", out_ddim);
std::cout << "after predict " << std::endl;
auto output_ptr = output->data<float>();
int stride = output->numel() / 100;
for (int i = 0; i < 100; i++) {
DLOG << " index:" << i * stride << " value: " << output_ptr[i * stride];
}
// for (int i = 0; i < 100; i++) {
// DLOG << " index:" << i << " value: "<< output_ptr[i];
// }
// for (int j = 0; j < output->numel(); ++j) {
// std::cout << " (index: " << j << " value: " << output_ptr[j] << ") ";
// }
std::cout << std::endl;
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <framework/program/program-optimize/program_optimize.h>
#include "../test_include.h"
#include "operators/fusion_fc_op.h"
namespace paddle_mobile {
namespace framework {
template <typename Dtype>
class TestFcOp {
public:
explicit TestFcOp(const Program<Dtype> p) : program_(p) {
use_optimize_ = true;
if (use_optimize_) {
to_predict_program_ = program_.optimizeProgram;
} else {
to_predict_program_ = program_.originProgram;
}
const std::vector<std::shared_ptr<BlockDesc>> blocks =
to_predict_program_->Blocks();
// DLOG << " **block size " << blocks.size();
for (int i = 0; i < blocks.size(); ++i) {
std::shared_ptr<BlockDesc> block_desc = blocks[i];
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
// DLOG << " ops " << ops.size();
for (int j = 0; j < ops.size(); ++j) {
std::shared_ptr<OpDesc> op = ops[j];
if (op->Type() == "fc" && op->Input("X")[0] == "pool2d_13.tmp_0") {
DLOG << " fc attr size: " << op->GetAttrMap().size();
DLOG << " inputs size: " << op->GetInputs().size();
DLOG << " outputs size: " << op->GetOutputs().size();
DLOG << " Input X is : " << op->Input("X")[0];
DLOG << " Input Y is : " << op->Input("Y")[0];
DLOG << " Input Y is : " << op->Input("Z")[0];
DLOG << " Output Out is : " << op->Output("Out")[0];
std::shared_ptr<operators::FusionFcOp<Dtype, float>> testOp =
std::make_shared<operators::FusionFcOp<Dtype, float>>(
op->Type(), op->GetInputs(), op->GetOutputs(),
op->GetAttrMap(), program_.scope);
ops_of_block_[*block_desc.get()].push_back(testOp);
}
}
}
}
std::shared_ptr<Tensor> predict(const Tensor &t1, const Tensor &t2,
const Tensor &t3) {
// feed
auto scope = program_.scope;
Variable *x_feed_value = scope->Var("pool2d_13.tmp_0");
auto tensor_x = x_feed_value->GetMutable<LoDTensor>();
tensor_x->ShareDataWith(t1);
Variable *y_feed_value = scope->Var("loss3_classifier-loc_weights");
auto tensor_y = y_feed_value->GetMutable<LoDTensor>();
tensor_y->ShareDataWith(t2);
Variable *z_feed_value = scope->Var("loss3_classifier-loc_biases");
auto tensor_z = z_feed_value->GetMutable<LoDTensor>();
tensor_z->ShareDataWith(t3);
Variable *con_output = scope->Var("loss3_classifier-loc.tmp_1");
auto *output_tensor = con_output->GetMutable<LoDTensor>();
output_tensor->mutable_data<float>({3, 10});
// DLOG << typeid(output_tensor).name();
// DLOG << "output_tensor dims: " << output_tensor->dims();
std::shared_ptr<LoDTensor> out_tensor = std::make_shared<LoDTensor>();
out_tensor.reset(output_tensor);
predict(t1, t2, t3, 0);
return out_tensor;
}
private:
const framework::Program<Dtype> program_;
std::shared_ptr<ProgramDesc> to_predict_program_;
std::map<framework::BlockDesc,
std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
ops_of_block_;
bool use_optimize_ = false;
void predict(const Tensor &t1, const Tensor &t2, const Tensor &t3,
int block_id) {
std::shared_ptr<BlockDesc> to_predict_block =
to_predict_program_->Block(block_id);
for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
auto op = ops_of_block_[*to_predict_block.get()][j];
DLOG << "op -> run()";
op->Run();
}
}
};
template class TestFcOp<CPU>;
} // namespace framework
} // namespace paddle_mobile
int main() {
DLOG << "----------**********----------";
DLOG << "begin to run Fc Test";
paddle_mobile::Loader<paddle_mobile::CPU> loader;
// "../../../test/models/googlenet"
auto program = loader.Load(g_googlenet);
paddle_mobile::framework::ProgramOptimize optimize;
// program.originProgram->Description("origin");
auto optimize_program = optimize.FusionOptimize(program.originProgram);
program.optimizeProgram = optimize_program;
if (optimize_program != nullptr) {
optimize_program->Description("optimize");
} else {
LOG(paddle_mobile::kLOG_ERROR) << "optimize_program is null";
}
/// input x (3,64,1,1)
paddle_mobile::framework::LoDTensor inputx;
SetupTensor<float>(&inputx, {3, 64, 1, 1}, static_cast<float>(1),
static_cast<float>(1));
auto *inputx_ptr = inputx.data<float>();
/// input y (64,10)
paddle_mobile::framework::LoDTensor inputy;
SetupTensor<float>(&inputy, {64, 10}, static_cast<float>(1.5),
static_cast<float>(1.5));
auto *inputy_ptr = inputy.data<float>();
paddle_mobile::framework::LoDTensor inputz;
SetupTensor<float>(&inputz, {10}, static_cast<float>(0),
static_cast<float>(1));
auto *inputz_ptr = inputz.data<float>();
paddle_mobile::framework::TestFcOp<paddle_mobile::CPU> testFcOp(program);
auto output = testFcOp.predict(inputx, inputy, inputz);
auto *output_ptr = output->data<float>();
for (int j = 0; j < output->numel(); ++j) {
DLOG << "value of output: " << output_ptr[j];
}
DLOG << "1 (3,64) * 2 (64,10) = 96(3,10)";
DLOG << "output : 96(3,10) + bias(10)";
return 0;
}
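With X filled with 1.0 (3x64 after flattening), Y filled with 1.5 (64x10) and a bias Z of length 10, every pre-bias output element equals 64 * 1.0 * 1.5 = 96, which is what the two closing DLOG lines describe. A reference fully-connected computation under those assumptions (the helper name is illustrative):

// Reference fully connected layer: out(M,N) = X(M,K) * Y(K,N) + Z(N).
// For the test above M = 3, K = 64, N = 10.
inline void fc_ref(const float *x, const float *y, const float *z, float *out,
                   int M, int K, int N) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float acc = z[n];
      for (int k = 0; k < K; ++k) {
        acc += x[m * K + k] * y[k * N + n];
      }
      out[m * N + n] = acc;
    }
  }
}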
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_include.h"
#include "operators/gru_op.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(g_nlp);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
Executor4Test<paddle_mobile::CPU,
paddle_mobile::operators::GruOp<paddle_mobile::CPU, float>>
executor(program, "gru");
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../executor_for_test.h"
#include "../test_include.h"
#include "operators/im2sequence_op.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(g_ocr_recg);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
Executor4Test<paddle_mobile::CPU,
paddle_mobile::operators::Im2SequenceOp<paddle_mobile::CPU, float>>
executor(program, "im2sequence");
// 1. input_tensors;
vector<Tensor> input_tensors;
Tensor input1;
auto input1_data = CreateInput<float>(&input1, {2, 2, 3, 3}, -1, 1);
input_tensors.push_back(input1);
// 2. input_names
vector<string> input_names({
"conv2d_19.tmp_1",
});
// 3. output_names
vector<string> output_names({"im2sequence_0.tmp_0"});
// 4. out_dims;
vector<DDim> out_ddims;
auto out_ddim = paddle_mobile::framework::make_ddim({8, 9});
out_ddims.push_back(out_ddim);
auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
output_names, out_ddims);
auto output0_data = output[0]->data<float>();
for (int j = 0; j < input_tensors[0].numel(); ++j) {
DLOG << " value of input: " << input1_data[j];
}
for (int j = 0; j < output[0]->numel(); ++j) {
DLOG << " value of output: " << output0_data[j];
}
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_include.h"
#include "operators/lrn_op.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(g_googlenet);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
Executor4Test<paddle_mobile::CPU,
paddle_mobile::operators::LrnOp<paddle_mobile::CPU, float>>
executor(program, "lrn");
// 1. input_tensors;
vector<Tensor> input_tensors;
Tensor input1;
auto input1_data = CreateInput<float>(&input1, {3, 4, 2, 2}, 0, 1);
input_tensors.push_back(input1);
// 2. input_names
vector<string> input_names({
"pool2d_0.tmp_0",
});
// 3. output_names
vector<string> output_names({"pool1_norm1.tmp_1"});
// 4. out_dims;
vector<DDim> out_ddims;
auto out_ddim = paddle_mobile::framework::make_ddim({3, 4, 2, 2});
out_ddims.push_back(out_ddim);
auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
output_names, out_ddims);
auto output0_data = output[0]->data<float>();
DLOG << " LrnOp input: ";
for (int i = 0; i < 3; i++) {
for (int j = 0; j < 4; j++) {
for (int c = 0; c < 2; c++) {
for (int d = 0; d < 2; d++) {
DLOGF("%f ", input1_data[i * 16 + j * 4 + c * 2 + d]);
}
DLOGF("\n");
}
DLOGF("\n");
}
DLOGF("\n");
}
DLOG << " LrnOp output: ";
for (int i = 0; i < 3; i++) {
for (int j = 0; j < 4; j++) {
for (int c = 0; c < 2; c++) {
for (int d = 0; d < 2; d++) {
DLOGF("%f ", output0_data[i * 16 + j * 4 + c * 2 + d]);
}
DLOGF("\n");
}
DLOGF("\n");
}
DLOGF("\n");
}
DLOG << input1_data[0] << " / ((1 + 0.00002 * ( " << input1_data[0] << "^2 + "
<< input1_data[4] << "^2 + " << input1_data[8] << "^2 ))^0.75) = ";
DLOG << output0_data[0];
return 0;
}
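The closing DLOG spells out the local response normalization formula used to verify element 0: squares of the same spatial position across a channel window are summed, scaled by alpha = 0.00002, and the input is divided by (1 + alpha * sum)^0.75. A reference computation for one element, assuming a window of n = 5 and k = 1 (the real attribute values come from the GoogLeNet model, so treat them as assumptions):

#include <algorithm>
#include <cmath>

// Reference LRN for one element of a single (C, H, W) sample.
inline float lrn_ref(const float *data, int C, int H, int W, int c, int h,
                     int w, int n = 5, float alpha = 0.00002f,
                     float beta = 0.75f, float k = 1.0f) {
  int begin = std::max(0, c - n / 2);
  int end = std::min(C - 1, c + n / 2);
  float sum = 0.f;
  for (int i = begin; i <= end; ++i) {
    float v = data[(i * H + h) * W + w];
    sum += v * v;
  }
  return data[(c * H + h) * W + w] / std::pow(k + alpha * sum, beta);
}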
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_include.h"
#include "operators/mul_op.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(g_resnet);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
Executor4Test<paddle_mobile::CPU,
paddle_mobile::operators::MulOp<paddle_mobile::CPU, float>>
executor(program, "mul");
// 1. input_tensors;
vector<Tensor> input_tensors;
Tensor input1;
auto input1_data = CreateInput<float>(&input1, {3, 2, 1, 1}, 0, 1);
input_tensors.push_back(input1);
Tensor input2;
auto input2_data = CreateInput<float>(&input2, {2, 3}, 0, 1);
input_tensors.push_back(input2);
// 2. input_names
vector<string> input_names({
"pool2d_0.tmp_0",
"fc_0.w_0",
});
// 3. output_names
vector<string> output_names({"fc_0.tmp_0"});
// 4. out_dims;
vector<DDim> out_ddims;
auto out_ddim = paddle_mobile::framework::make_ddim({3, 3});
out_ddims.push_back(out_ddim);
auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
output_names, out_ddims);
auto output0_data = output[0]->data<float>();
auto dim_1 = input1.numel() / input1.dims()[0];
DLOG << " input1 : ";
for (int i = 0; i < input1.dims()[0]; ++i) {
for (int j = 0; j < dim_1; ++j) {
DLOGF("%f ", input1_data[i * dim_1 + j]);
}
DLOGF("\n");
}
auto dim_2 = input2.numel() / input2.dims()[0];
DLOG << " input2 : ";
for (int i = 0; i < input2.dims()[0]; ++i) {
for (int j = 0; j < dim_2; ++j) {
DLOGF("%f ", input2_data[i * dim_2 + j]);
}
DLOGF("\n");
}
auto dim_output0 = output[0]->numel() / output[0]->dims()[0];
DLOG << " output : ";
for (int i = 0; i < output[0]->dims()[0]; ++i) {
for (int j = 0; j < dim_output0; ++j) {
DLOGF("%f ", output0_data[i * dim_2 + j]);
}
DLOGF("\n");
}
/// output (3,3)
DLOG << "output memory size : " << output[0]->memory_size();
DLOG << "output numel : " << output[0]->numel();
DLOG << input1_data[0] << " x " << input2_data[0] << " + " << input1_data[1]
<< " x " << input2_data[0 + 3] << " = " << output0_data[0];
return 0;
}
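The mul op treats its 4-D first input as a 2-D matrix: dimensions before num_col_dims multiply into the rows and the remaining dimensions into the columns, so (3, 2, 1, 1) collapses to (3, 2) and is multiplied by the (2, 3) weight to give the (3, 3) output checked above. The value of 1 is inferred from the shapes here, not read from the model. A small sketch of that flattening rule:

#include <functional>
#include <numeric>
#include <utility>
#include <vector>

// Collapse a shape into (rows, cols) the way a mul-style op does:
// dims before num_col_dims multiply into rows, the rest into cols.
// flatten_to_2d({3, 2, 1, 1}, 1) gives (3, 2).
inline std::pair<int, int> flatten_to_2d(const std::vector<int> &dims,
                                         int num_col_dims) {
  int rows = std::accumulate(dims.begin(), dims.begin() + num_col_dims, 1,
                             std::multiplies<int>());
  int cols = std::accumulate(dims.begin() + num_col_dims, dims.end(), 1,
                             std::multiplies<int>());
  return {rows, cols};
}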
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "../test_include.h"
#include "operators/multiclass_nms_op.h"
namespace paddle_mobile {
namespace framework {
template <typename Dtype>
class TestMultiClassNMSOp {
public:
explicit TestMultiClassNMSOp(const Program<Dtype> p) : program_(p) {
if (use_optimize_) {
to_predict_program_ = program_.optimizeProgram;
} else {
to_predict_program_ = program_.originProgram;
}
const std::vector<std::shared_ptr<BlockDesc>> blocks =
to_predict_program_->Blocks();
// DLOG << " **block size " << blocks.size();
for (auto block_desc : blocks) {
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
// DLOG << " ops " << ops.size();
for (auto op : ops) {
if (op->Type() == "multiclass_nms" &&
op->Input("BBoxes")[0] == "box_coder_0.tmp_0") {
DLOG << " mul attr size: " << op->GetAttrMap().size();
DLOG << " inputs size: " << op->GetInputs().size();
DLOG << " outputs size: " << op->GetOutputs().size();
DLOG << " BBoxes is : " << op->Input("BBoxes")[0];
DLOG << " Scores is : " << op->Input("Scores")[0];
DLOG << " Out is : " << op->Output("Out")[0];
DLOG << " keep_top_k : "
<< op->GetAttrMap().at("keep_top_k").Get<int>();
DLOG << " background_label : "
<< op->GetAttrMap().at("background_label").Get<int>();
DLOG << " nms_eta : " << op->GetAttrMap().at("nms_eta").Get<float>();
DLOG << " nms_threshold : "
<< op->GetAttrMap().at("nms_threshold").Get<float>();
DLOG << " nms_top_k : "
<< op->GetAttrMap().at("nms_top_k").Get<int>();
DLOG << " score_threshold : "
<< op->GetAttrMap().at("score_threshold").Get<float>();
// DLOG << " variances : " <<
// op->GetAttrMap().at("variances").Get<std::vector<float>>();
// DLOG << " aspect_ratios : " <<
// op->GetAttrMap().at("aspect_ratios").Get<std::vector<float>>();
// DLOG << " min_sizes : " <<
// op->GetAttrMap().at("min_sizes").Get<std::vector<float>>();
// DLOG << " max_sizes : " <<
// op->GetAttrMap().at("max_sizes").Get<std::vector<float>>();
std::shared_ptr<operators::MultiClassNMSOp<Dtype, float>> priorbox =
std::make_shared<operators::MultiClassNMSOp<Dtype, float>>(
op->Type(), op->GetInputs(), op->GetOutputs(),
op->GetAttrMap(), program_.scope);
ops_of_block_[*block_desc.get()].push_back(priorbox);
}
}
}
}
std::shared_ptr<Tensor> predict(const Tensor &t1, const Tensor &t2) {
// feed
auto scope = program_.scope;
Variable *x1_feed_value = scope->Var("box_coder_0.tmp_0");
auto tensor_x1 = x1_feed_value->GetMutable<LoDTensor>();
tensor_x1->ShareDataWith(t1);
Variable *x2_feed_value = scope->Var("transpose_12.tmp_0");
auto tensor_x2 = x2_feed_value->GetMutable<LoDTensor>();
tensor_x2->ShareDataWith(t2);
Variable *output = scope->Var("detection_output_0.tmp_0");
auto *output_tensor = output->GetMutable<LoDTensor>();
output_tensor->mutable_data<float>({1917, 6});
// DLOG << typeid(output_tensor).name();
// DLOG << "output_tensor dims: " << output_tensor->dims();
std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
out_tensor.reset(output_tensor);
predict(t1, t2, 0);
return out_tensor;
// return outvars_tensor;
}
private:
const framework::Program<Dtype> program_;
std::shared_ptr<ProgramDesc> to_predict_program_;
std::map<framework::BlockDesc,
std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
ops_of_block_;
bool use_optimize_ = false;
void predict(const Tensor &t1, const Tensor &t2, int block_id) {
std::shared_ptr<BlockDesc> to_predict_block =
to_predict_program_->Block(block_id);
for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
auto op = ops_of_block_[*to_predict_block.get()][j];
DLOG << "op -> run()";
op->Run();
}
}
};
template class TestMultiClassNMSOp<CPU>;
} // namespace framework
} // namespace paddle_mobile
int main() {
DLOG << "----------**********----------";
DLOG << "begin to run MulticlassNMS Test";
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(std::string("../../test/models/mobilenet+ssd"));
/// inputs: BBoxes (10,1917,4), Scores (10,21,1917)
paddle_mobile::framework::Tensor inputx1;
SetupTensor<float>(&inputx1, {10, 1917, 4}, static_cast<float>(0),
static_cast<float>(1));
auto *inputx1_ptr = inputx1.data<float>();
paddle_mobile::framework::Tensor inputx2;
SetupTensor<float>(&inputx2, {10, 21, 1917}, static_cast<float>(0),
static_cast<float>(1));
auto *inputx2_ptr = inputx2.data<float>();
paddle_mobile::framework::TestMultiClassNMSOp<paddle_mobile::CPU>
testMultiClassNMSOp(program);
auto output = testMultiClassNMSOp.predict(inputx1, inputx2);
auto *output_ptr = output->data<float>();
for (int i = 0; i < output->numel(); i++) {
DLOG << output_ptr[i];
}
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_include.h"
#include "operators/pool_op.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(std::string(g_googlenet));
if (program.originProgram == nullptr) {
DLOG << "program read file";
}
Executor4Test<paddle_mobile::CPU,
paddle_mobile::operators::PoolOp<paddle_mobile::CPU, float>>
executor(program, "pool2d");
paddle_mobile::framework::Tensor input;
SetupTensor<float>(&input, {1, 64, 112, 112}, static_cast<float>(0),
static_cast<float>(1));
auto out_ddim = paddle_mobile::framework::make_ddim({1, 64, 56, 56});
auto output =
executor.Predict(input, "conv2d_0.tmp_1", "pool2d_0.tmp_0", out_ddim);
float *output_ptr = output->data<float>();
for (int j = 0; j < output->numel(); ++j) {
DLOG << " value of output: " << output_ptr[j];
}
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../executor_for_test.h"
#include "../test_include.h"
#include "operators/prelu_op.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(g_resnet);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
Executor4Test<paddle_mobile::CPU,
paddle_mobile::operators::PReluOp<paddle_mobile::CPU, float>>
executor(program, "prelu");
// 1. input_tensors;
vector<Tensor> input_tensors;
Tensor input1;
auto input1_data = CreateInput<float>(&input1, {1, 2, 3, 4}, -1, 1);
input_tensors.push_back(input1);
// 2. input_names
vector<string> input_names({
"batch_norm_0.tmp_2",
});
// 3. output_names
vector<string> output_names({"batch_norm_0.tmp_3"});
// 4. out_dims;
vector<DDim> out_ddims;
auto out_ddim = paddle_mobile::framework::make_ddim({1, 2, 3, 4});
out_ddims.push_back(out_ddim);
auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
output_names, out_ddims);
auto output0_data = output[0]->data<float>();
for (int j = 0; j < output[0]->numel(); ++j) {
DLOG << " value of output: " << output0_data[j];
}
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "../test_include.h"
#include "operators/prior_box_op.h"
namespace paddle_mobile {
namespace framework {
template <typename Dtype>
class TestPriorBoxOp {
public:
explicit TestPriorBoxOp(const Program<Dtype> p) : program_(p) {
if (use_optimize_) {
to_predict_program_ = program_.optimizeProgram;
} else {
to_predict_program_ = program_.originProgram;
}
const std::vector<std::shared_ptr<BlockDesc>> blocks =
to_predict_program_->Blocks();
// DLOG << " **block size " << blocks.size();
for (auto block_desc : blocks) {
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
// DLOG << " ops " << ops.size();
for (auto op : ops) {
if (op->Type() == "prior_box" &&
op->Input("Input")[0] == "batch_norm_26.tmp_3") {
DLOG << " mul attr size: " << op->GetAttrMap().size();
DLOG << " inputs size: " << op->GetInputs().size();
DLOG << " outputs size: " << op->GetOutputs().size();
DLOG << " Input is : " << op->Input("Input")[0];
DLOG << " Image is : " << op->Input("Image")[0];
DLOG << " Output Boxes is : " << op->Output("Boxes")[0];
DLOG << " Output Variances is : " << op->Output("Variances")[0];
DLOG << " offset : " << op->GetAttrMap().at("offset").Get<float>();
DLOG << " step_h : " << op->GetAttrMap().at("step_h").Get<float>();
DLOG << " step_w : " << op->GetAttrMap().at("step_w").Get<float>();
DLOG << " flip : " << op->GetAttrMap().at("flip").Get<bool>();
DLOG << " clip : " << op->GetAttrMap().at("clip").Get<bool>();
// DLOG << " variances : " <<
// op->GetAttrMap().at("variances").Get<std::vector<float>>();
// DLOG << " aspect_ratios : " <<
// op->GetAttrMap().at("aspect_ratios").Get<std::vector<float>>();
// DLOG << " min_sizes : " <<
// op->GetAttrMap().at("min_sizes").Get<std::vector<float>>();
// DLOG << " max_sizes : " <<
// op->GetAttrMap().at("max_sizes").Get<std::vector<float>>();
std::shared_ptr<operators::PriorBoxOp<Dtype, float>> priorbox =
std::make_shared<operators::PriorBoxOp<Dtype, float>>(
op->Type(), op->GetInputs(), op->GetOutputs(),
op->GetAttrMap(), program_.scope);
ops_of_block_[*block_desc.get()].push_back(priorbox);
}
}
}
}
std::shared_ptr<Tensor> predict_priorbox(const Tensor &t1, const Tensor &t2) {
// feed
auto scope = program_.scope;
Variable *x1_feed_value = scope->Var("image");
auto tensor_x1 = x1_feed_value->GetMutable<LoDTensor>();
tensor_x1->ShareDataWith(t1);
Variable *x2_feed_value = scope->Var("batch_norm_26.tmp_3");
auto tensor_x2 = x2_feed_value->GetMutable<LoDTensor>();
tensor_x2->ShareDataWith(t2);
Variable *boxes_output = scope->Var("prior_box_1.tmp_0");
auto *boxes_output_tensor = boxes_output->GetMutable<LoDTensor>();
boxes_output_tensor->mutable_data<float>({10, 10, 6, 4});
Variable *variances_output = scope->Var("prior_box_1.tmp_1");
auto *variances_output_tensor = variances_output->GetMutable<LoDTensor>();
variances_output_tensor->mutable_data<float>({10, 10, 6, 4});
// DLOG << typeid(output_tensor).name();
// DLOG << "output_tensor dims: " << output_tensor->dims();
std::shared_ptr<Tensor> outboxes_tensor = std::make_shared<LoDTensor>();
outboxes_tensor.reset(boxes_output_tensor);
std::shared_ptr<Tensor> outvars_tensor = std::make_shared<LoDTensor>();
outvars_tensor.reset(variances_output_tensor);
predict_priorbox(t1, t2, 0);
return outboxes_tensor;
// return outvars_tensor;
}
private:
const framework::Program<Dtype> program_;
std::shared_ptr<ProgramDesc> to_predict_program_;
std::map<framework::BlockDesc,
std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
ops_of_block_;
bool use_optimize_ = false;
void predict_priorbox(const Tensor &t1, const Tensor &t2, int block_id) {
std::shared_ptr<BlockDesc> to_predict_block =
to_predict_program_->Block(block_id);
for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
auto op = ops_of_block_[*to_predict_block.get()][j];
DLOG << "op -> run()";
op->Run();
}
}
};
template class TestPriorBoxOp<CPU>;
} // namespace framework
} // namespace paddle_mobile
int main() {
DLOG << "----------**********----------";
DLOG << "begin to run PriorBoxOp Test";
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(std::string(g_mobilenet_ssd));
/// input x (1,3,300,300)
paddle_mobile::framework::Tensor input_image;
SetupTensor<float>(&input_image, {1, 3, 300, 300}, static_cast<float>(0),
static_cast<float>(1));
auto *input_image_ptr = input_image.data<float>();
paddle_mobile::framework::Tensor inputx1;
SetupTensor<float>(&inputx1, {1, 1024, 10, 10}, static_cast<float>(0),
static_cast<float>(1));
auto *inputx1_ptr = inputx1.data<float>();
paddle_mobile::framework::TestPriorBoxOp<paddle_mobile::CPU> testPriorBoxOp(
program);
auto output_priorbox = testPriorBoxOp.predict_priorbox(input_image, inputx1);
auto *output_priorbox_ptr = output_priorbox->data<float>();
for (int i = 0; i < output_priorbox->numel(); i++) {
DLOG << output_priorbox_ptr[i];
}
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_include.h"
#include "operators/relu_op.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(g_resnet);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
Executor4Test<paddle_mobile::CPU,
paddle_mobile::operators::ReluOp<paddle_mobile::CPU, float>>
executor(program, "relu");
// 1. input_tensors;
vector<Tensor> input_tensors;
Tensor input1;
auto input1_data = CreateInput<float>(&input1, {1, 2, 3, 4}, -1, 1);
input_tensors.push_back(input1);
// 2. input_names
vector<string> input_names({
"batch_norm_0.tmp_2",
});
// 3. output_names
vector<string> output_names({"batch_norm_0.tmp_3"});
// 4. out_dims;
vector<DDim> out_ddims;
auto out_ddim = paddle_mobile::framework::make_ddim({1, 2, 3, 4});
out_ddims.push_back(out_ddim);
auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
output_names, out_ddims);
auto output0_data = output[0]->data<float>();
for (int j = 0; j < output[0]->numel(); ++j) {
DLOG << " value of output: " << output0_data[j];
}
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_include.h"
#include "operators/reshape_op.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(std::string(g_mobilenet_ssd));
if (program.originProgram == nullptr) {
DLOG << "program read file";
}
Executor4Test<paddle_mobile::CPU,
paddle_mobile::operators::ReshapeOp<paddle_mobile::CPU, float>>
executor(program, "reshape");
paddle_mobile::framework::Tensor input;
SetupTensor<float>(&input, {2, 3, 3, 2}, static_cast<float>(0),
static_cast<float>(1));
auto input_ptr = input.data<float>();
auto out_ddim = paddle_mobile::framework::make_ddim({2, 9, 2});
auto output =
executor.Predict(input, "transpose_0.tmp_0", "reshape_0.tmp_0", out_ddim);
auto *output_ptr = output->data<float>();
DLOG << "input : ";
for (int j = 0; j < input.numel(); ++j) {
DLOG << " index " << j << " : " << input_ptr[j];
}
DLOG << "output : ";
for (int j = 0; j < output->numel(); ++j) {
DLOG << " index " << j << " : " << output_ptr[j];
}
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_include.h"
#include "operators/resize_op.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(std::string(g_mobilenet_ssd));
if (program.originProgram == nullptr) {
DLOG << "program read file";
}
Executor4Test<paddle_mobile::CPU,
paddle_mobile::operators::ResizeOp<paddle_mobile::CPU, float>>
executor(program, "resize");
paddle_mobile::framework::Tensor input;
SetupTensor<float>(&input, {2, 3, 3, 2}, static_cast<float>(0),
static_cast<float>(1));
auto input_ptr = input.data<float>();
auto out_ddim = paddle_mobile::framework::make_ddim({2, 9, 2});
auto output =
executor.Predict(input, "transpose_0.tmp_0", "reshape_0.tmp_0", out_ddim);
auto *output_ptr = output->data<float>();
DLOG << "input : ";
for (int j = 0; j < input.numel(); ++j) {
DLOG << " index " << j << " : " << input_ptr[j];
}
DLOG << "output : ";
for (int j = 0; j < output->numel(); ++j) {
DLOG << " index " << j << " : " << output_ptr[j];
}
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_include.h"
#include "operators/scale_op.h"
int main() {}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../../src/operators/kernel/sigmoid_kernel.h"
#include "../../src/operators/kernel/central-arm-func/sigmoid_arm_func.h"
#include "../test_helper.h"
#include "io/executor.h"
int main() {
paddle_mobile::framework::Tensor input;
paddle_mobile::framework::Tensor output;
SetupTensor<float>(&input, {1, 4, 60, 60}, static_cast<float>(0),
static_cast<float>(1));
auto out_ddim = paddle_mobile::framework::make_ddim({1, 4, 60, 60});
output.Resize(out_ddim);
paddle_mobile::operators::sigmoid(&input, &output);
auto *output_ptr = output.data<float>();
for (int j = 0; j < output.numel(); ++j) {
DLOG << " value of output: " << output_ptr[j];
}
return 0;
}
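The sigmoid kernel under test computes 1 / (1 + exp(-x)) element-wise, so each output value can be compared against a scalar reference. A minimal sketch (the check itself is an illustration, not part of the original test):

#include <cmath>

// Scalar reference for the sigmoid kernel exercised above.
inline float sigmoid_ref(float x) { return 1.f / (1.f + std::exp(-x)); }

// Usage sketch:
// for (int j = 0; j < output.numel(); ++j) {
//   float expected = sigmoid_ref(input.data<float>()[j]);
//   // expect |output_ptr[j] - expected| to be within a small tolerance
// }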
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_include.h"
#include "operators/slice_op.h"
int main() {}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_include.h"
#include "operators/softmax_op.h"
int main() {
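// Load the MobileNet program and run its softmax op on a random 1 x 1000
// input, printing each output value.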
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(std::string(g_mobilenet));
if (program.originProgram == nullptr) {
DLOG << "program read file";
}
Executor4Test<paddle_mobile::CPU,
paddle_mobile::operators::SoftmaxOp<paddle_mobile::CPU, float>>
executor(program, "softmax");
paddle_mobile::framework::Tensor input;
SetupTensor<float>(&input, {1, 1000}, static_cast<float>(0),
static_cast<float>(1));
auto out_ddim = paddle_mobile::framework::make_ddim({1, 1000});
auto output =
executor.Predict(input, "reshape_0.tmp_0", "softmax_0.tmp_0", out_ddim);
auto *output_ptr = output->data<float>();
for (int j = 0; j < output->numel(); ++j) {
DLOG << " value of output: " << output_ptr[j];
}
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_helper.h"
#include "../test_include.h"
#include "operators/transpose_op.h"
int main() {
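// Load the MobileNet-SSD program and run its transpose op on a random
// {1, 2, 3, 4} tensor; the {1, 3, 4, 2} output is printed alongside the
// input so the axis permutation can be checked by hand.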
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(std::string(g_mobilenet_ssd));
if (program.originProgram == nullptr) {
DLOG << "program read file";
}
Executor4Test<paddle_mobile::CPU, paddle_mobile::operators::TransposeOp<
paddle_mobile::CPU, float>>
executor(program, "transpose");
paddle_mobile::framework::Tensor input;
SetupTensor<float>(&input, {1, 2, 3, 4}, static_cast<float>(0),
static_cast<float>(1));
auto input_ptr = input.data<float>();
auto out_ddim = paddle_mobile::framework::make_ddim({1, 3, 4, 2});
auto output =
executor.Predict(input, "conv2d_22.tmp_1", "transpose_0.tmp_0", out_ddim);
auto *output_ptr = output->data<float>();
DLOG << "input : ";
for (int j = 0; j < input.numel(); ++j) {
DLOG << " index " << j << " : " << input_ptr[j];
}
DLOG << "output : ";
for (int j = 0; j < output->numel(); ++j) {
DLOG << " index " << j << " : " << output_ptr[j];
}
DLOG << " for example : ";
DLOG << " you can check if input[16] == output[9] ";
DLOG << " you can check if input[12] == output[1] ";
return 0;
}
......@@ -41,13 +41,18 @@ static const char *g_resnet_50 = "../models/resnet_50";
static const char *g_resnet = "../models/resnet";
static const char *g_googlenet_combine = "../models/googlenet_combine";
static const char *g_yolo = "../models/yolo";
static const char *g_yolo_combined = "../models/yolo_combined";
static const char *g_fluid_fssd_new = "../models/fluid_fssd_new";
static const char *g_test_image_1x3x224x224 =
"../images/test_image_1x3x224x224_float";
static const char *g_test_image_1x3x224x224_banana =
"../images/input_3x224x224_banana";
static const char *g_test_image_desktop_1_3_416_416_nchw_float =
"../images/in_put_1_3_416_416_2";
static const char *g_hand = "../images/hand_image";
static const char *g_imgfssd_ar = "../images/test_image_ssd_ar";
static const char *g_imgfssd_ar1 = "../images/003_0001.txt";
static const char *g_img = "../images/img.bin";
using paddle_mobile::framework::DDim;
......
......@@ -92,6 +92,47 @@ build_for_android() {
make -j 8
}
build_for_arm_linux() {
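# Cross-compile the Release build for ARM Linux with the arm-linux-gnueabihf
# toolchain, then download the shared test models and images if the
# test/models directory is empty.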
MODE="Release"
ARM_LINUX="arm-linux"
if [ "${#NETS}" -gt 1 ]; then
cmake .. \
-B"../build/release/arm-linux" \
-DCMAKE_BUILD_TYPE="${MODE}" \
-DCMAKE_TOOLCHAIN_FILE="./tools/toolchains/arm-linux-gnueabihf.cmake" \
-DCMAKE_CXX_FLAGS="-std=c++14 -mcpu=cortex-a53 -mtune=cortex-a53 -mfpu=neon-vfpv4 -mfloat-abi=hard -ftree-vectorize -funsafe-math-optimizations -pipe -mlittle-endian -munaligned-access" \
-DNET="${NETS}" \
-D"V7"=true
else
cmake .. \
-B"../build/release/arm-linux" \
-DCMAKE_BUILD_TYPE="${MODE}" \
-DCMAKE_TOOLCHAIN_FILE="./tools/toolchains/arm-linux-gnueabihf.cmake" \
-DCMAKE_CXX_FLAGS="-std=c++14 -mcpu=cortex-a53 -mtune=cortex-a53 -mfpu=neon-vfpv4 -mfloat-abi=hard -ftree-vectorize -funsafe-math-optimizations -pipe -mlittle-endian -munaligned-access" \
-DNET="${NETS}" \
-D"V7"=true
fi
cd "../build/release/arm-linux"
make -j 8
cd "../../../test/"
DIRECTORY="models"
if [ "`ls -A $DIRECTORY`" = "" ]; then
echo "$DIRECTORY is indeed empty pull images"
wget http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip
unzip paddle-mobile%2FmodelsAndImages.zip
mv modelsAndImages/images/ images
mv modelsAndImages/models/ models
rm -rf paddle-mobile%2FmodelsAndImages.zip
rm -rf __MACOSX
else
echo "$DIRECTORY is indeed not empty, DONE!"
fi
}
build_for_ios() {
# rm -rf "../build"
PLATFORM="ios"
......@@ -135,7 +176,7 @@ if [ $# -lt 1 ]; then
echo "sample usage: ./build.sh android"
else
params=($@)
for(( i=1; i<$#; i++ )); do
for(( i=1; i<$#; i++ )); do
if [ ${i} != 1 ]; then
NETS=$NETS$";"
fi
......@@ -162,6 +203,8 @@ else
if [ $1 = "android" ]; then
build_for_android
elif [ $1 = "arm_linux" ]; then
build_for_arm_linux
elif [ $1 = "ios" ]; then
build_for_ios
else
......
set(FOUND_MATCH OFF)
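# FOUND_MATCH records whether NET named a known target; each block below
# switches on only the operators that target needs.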
if ("googlenet" IN_LIST NET)
set(CON -1)
message(STATUS "nets :${NET}")
list(FIND NET "googlenet" CON)
if (CON GREATER -1)
message("googlenet enabled")
set(CONCAT_OP ON)
set(CONV_OP ON)
......@@ -15,7 +20,8 @@ if ("googlenet" IN_LIST NET)
set(FOUND_MATCH ON)
endif()
if ("mobilenet" IN_LIST NET)
list(FIND NET "mobilenet" CON)
if (CON GREATER -1)
message("mobilenet enabled")
set(CONV_OP ON)
set(ELEMENTWISEADD_OP ON)
......@@ -33,7 +39,8 @@ if ("mobilenet" IN_LIST NET)
endif()
if ("mobilenetssd" IN_LIST NET)
list(FIND NET "mobilenetssd" CON)
if (CON GREATER -1)
message("mobilenetssd enabled")
set(FUSION_CONVBNRELU_OP ON)
set(FUSION_CONVBNRELU_OP ON)
......@@ -55,7 +62,8 @@ if ("mobilenetssd" IN_LIST NET)
endif()
if ("yolo" IN_LIST NET)
list(FIND NET "yolo" CON)
if (CON GREATER -1)
message("yolo enabled")
set(BATCHNORM_OP ON)
set(CONV_OP ON)
......@@ -65,7 +73,8 @@ if ("yolo" IN_LIST NET)
set(FOUND_MATCH ON)
endif()
if ("squeezenet" IN_LIST NET)
list(FIND NET "squeezenet" CON)
if (CON GREATER -1)
message("squeezenet enabled")
set(CONCAT_OP ON)
set(CONV_OP ON)
......@@ -79,7 +88,8 @@ if ("squeezenet" IN_LIST NET)
endif()
if ("resnet" IN_LIST NET)
list(FIND NET "resnet" CON)
if (CON GREATER -1)
message("resnet enabled")
set(CONCAT_OP ON)
set(CONV_OP ON)
......@@ -95,7 +105,8 @@ if ("resnet" IN_LIST NET)
set(FOUND_MATCH ON)
endif()
if ("FPGAnets" IN_LIST NET)
list(FIND NET "FPGAnets" CON)
if (CON GREATER -1)
message("FPGAnets enabled")
set(FUSION_CONVADDRELU_OP ON)
set(FUSION_CONVADDBNRELU_OP ON)
......@@ -114,7 +125,8 @@ if ("FPGAnets" IN_LIST NET)
set(FOUND_MATCH ON)
endif()
if ("nlp" IN_LIST NET)
list(FIND NET "nlp" CON)
if (CON GREATER -1)
message("nlp enabled")
set(FUSION_FC_OP ON)
set(LOOKUP_OP ON)
......@@ -127,6 +139,43 @@ if ("nlp" IN_LIST NET)
set(FOUND_MATCH ON)
endif()
list(FIND NET "mobilenetfssd" CON)
if (CON GREATER -1)
message("mobilenetfssd enabled")
set(FUSION_CONVADDRELU_OP ON)
set(FUSION_CONVADDBNRELU_OP ON)
set(FUSION_CONVADD_OP ON)
set(SOFTMAX_OP ON)
set(RESHAPE_OP ON)
set(BILINEAR_INTERP_OP ON)
set(TRANSPOSE_OP ON)
set(CONCAT_OP ON)
set(PRIORBOX_OP ON)
set(BATCHNORM_OP ON)
set(BOXCODER_OP ON)
set(MULTICLASSNMS_OP ON)
set(FLATTEN_OP ON)
set(SPLIT_OP ON)
set(SHAPE_OP ON)
set(FOUND_MATCH ON)
endif()
list(FIND NET "genet" CON)
if (CON GREATER -1)
message("genet enabled")
set(FUSION_CONVADDPRELU_OP ON)
set(FUSION_CONVADDADDPRELU_OP ON)
set(FUSION_CONVADD_OP ON)
set(CONV_TRANSPOSE_OP ON)
set(FUSION_CONVADDRELU_OP ON)
set(ELEMENTWISEADD_OP ON)
set(PRELU_OP ON)
set(POOL_OP ON)
set(CONCAT_OP ON)
set(FOUND_MATCH ON)
endif()
if(NOT FOUND_MATCH)
message("--default--")
......@@ -336,4 +385,4 @@ endif()
if (SHAPE_OP)
add_definitions(-DSHAPE_OP)
endif()
\ No newline at end of file
endif()
# CMake toolchain file for building ARM software on Linux environment
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR arm)
set(CMAKE_SYSTEM_VERSION 1)
message("if U build on platform . this is right.")
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
\ No newline at end of file