tensor quant

313b4b5a · chonwhite · 80fa3521 · 313b4b5a · 313b4b5a · 313b4b5a
13 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,9 +6,9 @@ option(USE_OPENMP "openmp support" OFF)
 option(USE_EXCEPTION "use std exception" ON)
 option(LOG_PROFILE "log profile" ON)
 # select the platform to build
-option(CPU "armv7 with neon" ON)
+option(CPU "armv7 with neon" OFF)
 option(MALI_GPU "mali gpu" OFF)
-option(FPGA "fpga" OFF)
+option(FPGA "fpga" ON)

 file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm)
 file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)
@@ -139,7 +139,7 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build)
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build)

 # NET default
-set(NET "default" CACHE STRING "select net type")
+set(NET "FPGAnets" CACHE STRING "select net type")
 set_property(CACHE NET PROPERTY STRINGS "default" "googlenet" "mobilenet" "yolo" "squeezenet" "FPGAnets")
 include("${CMAKE_CURRENT_LIST_DIR}/tools/op.cmake")


--- a/src/fpga/fpga_quantilization.cpp
+++ b/src/fpga/fpga_quantilization.cpp
@@ -46,8 +46,12 @@ static Dtype find_max(Dtype* data, int num) {
  return max;
 }

+
 // template <typename Dtype>
-framework::Tensor* quantify_filter(framework::Tensor* filter) {
+void quantify_filter(framework::Tensor* filter) {  // now returns void; results are written back through `filter`
+
+  DLOG << "quantilize_filter........";
+

  float scale = 0;
  float fix_range = static_cast<float>((1 << (8 - 1)) - 1);  // (1 << 7) - 1 = 127

@@ -62,25 +66,20 @@ framework::Tensor* quantify_filter(framework::Tensor* filter) {
  // 32bit filter -> 8bit filter;
  if (filter->type() == typeid(float)) {
    float* float_data = filter->data<float>();
-    float max = find_max(float_data, filter->numel());
+    float max = find_max<float>(float_data, filter->numel());

    scale = (max / fix_range);

-    framework::Tensor* filter = filter;
-    framework::Tensor* quant_filter = new framework::Tensor();
-
-    int_data = quant_filter->mutable_data<int8_t>();
    for (int i = 0; i < filter->numel(); ++i) {
      tmp_data[i] = (int8_t)float_data[i] * scale;  // NOTE(review): the cast binds tighter than `*`, so the float is truncated to int8_t before scaling; dividing by scale and casting the result looks intended — confirm
    }
-    filter = quant_filter;
+    int_data = filter->mutable_data<int8_t>();  // requested only after the float values were copied into tmp_data
  } else {
-    int8_t max = find_max(filter->data<int8_t>(), filter->numel());
+    int8_t max = find_max<int8_t>(filter->data<int8_t>(), filter->numel());
    scale = (max / fix_range);

-    int_data = filter->data<int8_t>();
    for (int i = 0; i < filter->numel(); ++i) {
-      tmp_data[i] = int_data[i];
+      tmp_data[i] = filter->data<int8_t>()[i];
    }
    int_data = filter->mutable_data<int8_t>();
  }
@@ -88,7 +87,7 @@ framework::Tensor* quantify_filter(framework::Tensor* filter) {
  chw_to_hwc<int8_t>(tmp_data, int_data, batch_size, channel, height, width);
  delete tmp_data;  // NOTE(review): tmp_data is indexed as an array and its allocation is outside this hunk; if it comes from new[], this must be delete[] — confirm
  *(filter->fpga_args().scale_pointer()) = scale;  // stores the computed scale in the tensor's fpga_args
-  return filter;
+
 }

 }  // namespace fpga

--- a/src/fpga/fpga_quantilization.h
+++ b/src/fpga/fpga_quantilization.h
@@ -25,6 +25,7 @@ static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int num, int channel,
                       int height, int width);

 // template <typename Dtype>
-framework::Tensor* quantify_filter(framework::Tensor* filter);
+void quantify_filter(framework::Tensor* filter);
+
 }  // namespace fpga
 }  // namespace paddle_mobile
--- a/src/io/executor.cpp
+++ b/src/io/executor.cpp
@@ -77,6 +77,7 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
      auto op_base = framework::OpRegistry<Dtype>::CreateOp(
          op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(),
          program_.scope);
+      DLOG << "InferShape: ";
      op_base->InferShape();
      ops_of_block_[*block_desc.get()].push_back(op_base);
 #ifdef PADDLE_EXECUTOR_MULTITHREAD
@@ -84,16 +85,19 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
 #endif
    }
  }
+   DLOG << "InitMemory: ";
+
  if (program_.combined) {
    InitCombineMemory();
  } else {
    InitMemory();
  }
-
+  DLOG << "InitMemory end ";
  std::shared_ptr<framework::BlockDesc> to_predict_block =
      to_predict_program_->Block(0);
  auto &ops = ops_of_block_[*to_predict_block.get()];
  for (const auto &op : ops) {
+     DLOG << "Init op " << op->Type();
    op->Init();
  }
 }

--- a/src/memory/t_malloc.cpp
+++ b/src/memory/t_malloc.cpp
@@ -26,7 +26,7 @@ namespace paddle_mobile {
 namespace memory {
 const int MALLOC_ALIGN = 64;

-#ifdef PADDLE_MOBILE_FPGA
+#ifdef PADDLE_MOBILE_FPGA__VV
 namespace fpga = paddle_mobile::fpga;

 void Copy(void *dst, const void *src, size_t num) {

--- a/src/operators/feed_op.h
+++ b/src/operators/feed_op.h
@@ -37,7 +37,7 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
    param_.Out()->Resize(out_dims);
  }

-#ifdef PADDLE_MOBILE_FPGA
+#ifdef PADDLE_MOBILE_FPGA__VV
  void RunImpl() const { fpga::PerformBypass(param_.FpgaArgs()); }
  void Init() {
    const Tensor *input = param_.InputX();

--- a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
@@ -60,10 +60,7 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam *param) {
  param->SetNewScale(new_scale);
  param->SetNewBias(new_bias);

-  Tensor *quant_filter = fpga::quantify_filter(filter);
-
-  // delete original filter?
-  filter = quant_filter;
+  fpga::quantify_filter(filter);

  auto filter_ptr = filter->data<float>();
  fpga::ConvArgs convArgs;

--- a/src/operators/kernel/fpga/conv_add_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef FUSION_CONVADD_OP
+
+#include "operators/kernel/conv_add_kernel.h"
+#include "../central-arm-func/conv_add_arm_func.h"
+#include "fpga/fpga_quantilization.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>  // NOTE(review): this file is under kernel/fpga/ yet specializes on CPU, while conv_kernel.cpp there uses FPGA — confirm intent
+bool ConvAddKernel<CPU, float>::Init(FusionConvAddParam *param) {
+  DLOG << ">>>>>>>>>>>>>>>>>>>> ConvKernel <<<<<<<<<<<<<<<<<<<<<<<";  // banner says ConvKernel although this is ConvAddKernel
+  Tensor *filter = param->Filter();
+  fpga::quantify_filter(filter);  // quantify_filter returns void and writes its result back through `filter`
+  return true;  // always reports success
+}
+
+template <>
+void ConvAddKernel<CPU, float>::Compute(const FusionConvAddParam &param) const {
+  ConvAddCompute<float>(param);  // delegates to ConvAddCompute; presumably declared in conv_add_arm_func.h — confirm
+}
+
+template class ConvAddKernel<CPU, float>;
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
+
--- a/src/operators/kernel/fpga/conv_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_kernel.cpp
@@ -27,7 +27,7 @@ bool ConvKernel<FPGA, float>::Init(ConvParam *param) {

 template <>
 void ConvKernel<FPGA, float>::Compute(const ConvParam &param) const {
-  // ConvCompute<float>(param);
+  ConvCompute<float>(param);  // re-enables the call that the removed line had commented out
 }

 template class ConvKernel<FPGA, float>;

--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -210,7 +210,7 @@ class ConvParam : OpParam {

  const Tensor *Input() const { return input_; }

-  const Tensor *Filter() const { return filter_; }
+  Tensor *Filter() const { return filter_; }

  Tensor *Output() const { return output_; }


--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -27,6 +27,11 @@ elseif("resnet" IN_LIST NET)
    ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h  test_include.h executor_for_test.h)
    target_link_libraries(test-resnet paddle-mobile)
 elseif("FPGAnets" IN_LIST NET)
+    # ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h  test_include.h executor_for_test.h)
+    # target_link_libraries(test-resnet paddle-mobile)
+    ADD_EXECUTABLE(test-tensor-quant fpga/test_tensor_quant.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test-tensor-quant paddle-mobile)
+    
 else ()

    # gen test
@@ -173,8 +178,7 @@ else ()

 endif()

-if(FPGA)
-    ADD_EXECUTABLE(test-tensor-quant fpga/test_tensor_quant.cpp test_helper.h  test_include.h executor_for_test.h)
-    target_link_libraries(test-tensor-quant paddle-mobile)
-
-endif()
+# if(FPGA)
+#     ADD_EXECUTABLE(test-tensor-quant fpga/test_tensor_quant.cpp test_helper.h  test_include.h executor_for_test.h)
+#     target_link_libraries(test-tensor-quant paddle-mobile)
+# endif()
--- a/test/fpga/test_tensor_quant.cpp
+++ b/test/fpga/test_tensor_quant.cpp
@@ -12,23 +12,34 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <fstream>
+#include <iostream>
 #include "../test_helper.h"
 #include "../test_include.h"

 int main() {
-  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
-  bool optimize = false;
-  if (paddle_mobile.Load(g_googlenet, optimize)) {
-    auto time1 = time();
-    DLOG << "load cost: " << time_diff(time1, time1) << "ms";
-    std::vector<float> input;
-    std::vector<int64_t> dims{1, 3, 224, 224};
-    GetInput<float>(g_test_image_1x3x224x224, &input, dims);
+  paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
+  paddle_mobile.SetThreadNum(4);
+  auto time1 = time();  // start of model-load timing
+  if (paddle_mobile.Load(g_resnet, true)) {
+    auto time2 = time();  // end of model-load timing
+    std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
+    std::vector<int64_t> dims{1, 3, 32, 32};
+    Tensor input_tensor;
+    SetupTensor<float>(&input_tensor, {1, 3, 32, 32}, static_cast<float>(0),
+                       static_cast<float>(1));
+
+    std::vector<float> input(input_tensor.data<float>(),
+                             input_tensor.data<float>() + input_tensor.numel());
+    // Warm up once; this call is outside the timed region.
+    paddle_mobile.Predict(input, dims);
     auto time3 = time();
-    auto vec_result = paddle_mobile.Predict(input, dims);
+    for (int i = 0; i < 10; ++i) {  // timed region: 10 consecutive predictions
+      paddle_mobile.Predict(input, dims);
+    }
     auto time4 = time();
-    DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
+    std::cout << "predict cost :" << time_diff(time3, time4) << "ms"  // total for all 10 runs, not a per-run average
+              << std::endl;
   }
+
   return 0;
 }
--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -82,6 +82,8 @@ if ("FPGAnets" IN_LIST NET)
  set(CONCAT_OP ON)
  set(SOFTMAX_OP ON)
  set(DROPOUT_OP ON)
+  set(FUSION_CONVADD_OP ON)
+  # set(CONV_OP ON)

  set(FOUND_MATCH ON)   
 endif()