Commit ef219662 authored by qnqinan, committed by GitHub

Merge pull request #792 from chonwhite/develop

fix:#791
@@ -56,11 +56,17 @@ void *fpga_malloc(size_t size) {
   return reinterpret_cast<void *>(
       mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
 #else
-  return NULL;
+  return malloc(size);
 #endif
 }
-void fpga_free(void *ptr) { munmap(ptr, 0); }
+void fpga_free(void *ptr) {
+#ifdef PADDLE_MOBILE_OS_LINUX
+  munmap(ptr, 0);
+#else
+  free(ptr);
+#endif
+}
 void fpga_copy(void *dest, const void *src, size_t num) {
   memcpy(dest, src, num);
......
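Note: with this change the non-Linux branch of fpga_malloc returns memory from malloc, so fpga_free must release it through the matching branch. A minimal usage sketch (hypothetical caller and buffer size, for illustration only):

#include <cstddef>
#include <cstring>

// Hypothetical caller: obtain a scratch buffer through fpga_malloc and
// release it with fpga_free, so the munmap/free choice always matches
// the mmap64/malloc branch that produced the pointer.
void scratch_example() {
  const size_t kBytes = 1024;  // assumed size, illustration only
  void *buf = fpga_malloc(kBytes);
  std::memset(buf, 0, kBytes);
  fpga_free(buf);  // pairs with fpga_malloc on both build paths
}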
@@ -48,16 +48,11 @@ static Dtype find_max(Dtype* data, int64_t num) {
 // template <typename Dtype>
 void quantize_filter(framework::Tensor* filter) {
-  DLOG << "quantize_filter........";
+  DLOG << "quantize_filter........" << filter->dims();
   float scale = 0;
   auto fix_range = static_cast<float>(std::pow(2, 8 - 1) - 1);
-  const auto batch_size = filter->dims()[0];
-  const auto channel = filter->dims()[1];
-  const auto height = filter->dims()[2];
-  const auto width = filter->dims()[3];
   auto* tmp_data = new int8_t[filter->numel()];
   // 32bit filter -> 8bit filter;
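The omitted lines between these hunks hold the actual float-to-int8 loop. A hedged sketch of the symmetric scheme implied by fix_range = 2^(8-1) - 1 = 127 and scale = fix_range / max (quantize_one is a hypothetical name; the real loop is in the elided code):

#include <cmath>
#include <cstdint>

// Sketch only: map one float weight to int8 with the symmetric scale
// computed above; the result is clamped to the signed 8-bit range.
int8_t quantize_one(float x, float scale) {
  float v = std::round(x * scale);
  if (v > 127.0f) v = 127.0f;
  if (v < -127.0f) v = -127.0f;
  return static_cast<int8_t>(v);
}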
@@ -76,9 +71,19 @@ void quantize_filter(framework::Tensor* filter) {
     scale = (fix_range / max);
     std::memcpy(tmp_data, filter->data<int8_t>(), (size_t)filter->numel());
   }
-  // NCHW -> NHWC;
-  chw_to_hwc<int8_t>(tmp_data, filter->mutable_data<int8_t>(), batch_size,
-                     channel, height, width);
+  if (filter->dims().size() == 4) {
+    const auto batch_size = filter->dims()[0];
+    const auto channel = filter->dims()[1];
+    const auto height = filter->dims()[2];
+    const auto width = filter->dims()[3];
+    // NCHW -> NHWC;
+    chw_to_hwc<int8_t>(tmp_data, filter->mutable_data<int8_t>(), batch_size,
+                       channel, height, width);
+  } else if (filter->dims().size() == 2) {
+    std::memcpy(filter->mutable_data<int8_t>(), tmp_data,
+                (size_t)filter->numel());
+  }
   delete[] tmp_data;
   filter->SetFpgaScale(scale);
 }
......
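For context, a minimal sketch of the relayout that chw_to_hwc is assumed to perform (signature inferred from the call above; the real helper lives elsewhere in the FPGA sources): element (n, c, h, w) of an NCHW tensor moves to position (n, h, w, c).

// Sketch of a per-batch CHW -> HWC transpose matching the call site above.
template <typename Dtype>
void chw_to_hwc_sketch(const Dtype *chw, Dtype *hwc, int num, int channel,
                       int height, int width) {
  for (int n = 0; n < num; n++) {
    const Dtype *in = chw + n * channel * height * width;
    Dtype *out = hwc + n * height * width * channel;
    for (int c = 0; c < channel; c++)
      for (int h = 0; h < height; h++)
        for (int w = 0; w < width; w++)
          out[(h * width + w) * channel + c] =
              in[(c * height + h) * width + w];
  }
}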
@@ -26,7 +26,7 @@ namespace paddle_mobile {
 namespace memory {
 const int MALLOC_ALIGN = 64;
-#ifdef PADDLE_MOBILE_FPGA__VV
+#ifdef PADDLE_MOBILE_FPGA
 namespace fpga = paddle_mobile::fpga;
 void Copy(void *dst, const void *src, size_t num) {
......
@@ -32,9 +32,9 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam *param) {
   args.convert_type = fpga::DATA_FP16_TO_FP32;
   args.layout_type = fpga::LAYOUT_HWC_TO_CHW;
   args.image.address = (void *)(input_ptr);
-  args.image.height = input->dims()[1];
-  args.image.width = input->dims()[2];
-  args.image.channels = input->dims()[3];
+  args.image.height = input->dims()[0];
+  args.image.width = input->dims()[1];
+  args.image.channels = 1;
   args.output.address = output_ptr;
   param->SetFpgaArgs(args);
 }
......
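The fix treats the softmax input as a 2D tensor instead of reading NHWC fields it does not have. A hedged illustration (SoftmaxImageDims is a hypothetical helper; the {batch, classes} interpretation of the two dims is an assumption):

#include <cassert>
#include <vector>

struct ImageDims {
  int height, width, channels;
};

// Hypothetical helper mirroring the changed lines: a 2D input such as
// {1, 1000} yields height 1, width 1000, and a fixed channel count of 1.
ImageDims SoftmaxImageDims(const std::vector<int> &dims) {
  assert(dims.size() == 2);  // 2D {batch, classes} input assumed
  return {dims[0], dims[1], 1};
}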
@@ -31,6 +31,9 @@ elseif("FPGAnets" IN_LIST NET)
   # target_link_libraries(test-resnet paddle-mobile)
   ADD_EXECUTABLE(test-tensor-quant fpga/test_tensor_quant.cpp test_helper.h test_include.h executor_for_test.h)
   target_link_libraries(test-tensor-quant paddle-mobile)
+  ADD_EXECUTABLE(test-fpga-concat-op fpga/test_concat_op.cpp test_helper.h test_include.h)
+  target_link_libraries(test-fpga-concat-op paddle-mobile)
 elseif("mobilenetssd" IN_LIST NET)
   # gen test
   ADD_EXECUTABLE(test-mobilenetssd net/test_mobilenet+ssd.cpp test_helper.h test_include.h executor_for_test.h)
......
new file: test/fpga/test_concat_op.cpp
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_include.h"
#include "operators/concat_op.h"
int main() {
  paddle_mobile::Loader<paddle_mobile::FPGA> loader;
  auto program = loader.Load(g_googlenet);
  PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
                        "program file read fail");

  Executor4Test<paddle_mobile::FPGA,
                paddle_mobile::operators::ConcatOp<paddle_mobile::FPGA, float>>
      executor(program, "concat");

  // 1. input_tensors;
  vector<Tensor> input_tensors;

  Tensor input1;
  auto input1_data = CreateInput<float>(&input1, {4, 10, 2, 2}, 0, 1);
  input_tensors.push_back(input1);

  Tensor input2;
  auto input2_data = CreateInput<float>(&input2, {4, 20, 2, 2}, 0, 1);
  input_tensors.push_back(input2);

  Tensor input3;
  auto input3_data = CreateInput<float>(&input3, {4, 30, 2, 2}, 0, 1);
  input_tensors.push_back(input3);

  Tensor input4;
  auto input4_data = CreateInput<float>(&input4, {4, 40, 2, 2}, 0, 1);
  input_tensors.push_back(input4);

  // 2. input_names
  vector<string> input_names({
      "conv2d_3.tmp_1",
      "conv2d_5.tmp_1",
      "conv2d_7.tmp_1",
      "conv2d_8.tmp_1",
  });

  // 3. output_names
  vector<string> output_names({"concat_0.tmp_0"});

  // 4. out_dims; concat of (4,10,2,2)..(4,40,2,2) along axis 1 is (4,100,2,2)
  vector<DDim> out_ddims;
  auto out_ddim = paddle_mobile::framework::make_ddim({4, 100, 2, 2});
  out_ddims.push_back(out_ddim);

  auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
                                            output_names, out_ddims);

  auto output0_data = output[0]->data<float>();

  // 5. test one example.
  int input_n = 1;
  int input_c = 2;
  int input_h = 0;
  int input_w = 1;

  int stride0 = input3.numel() / input3.dims()[0];
  int stride1 = input3.numel() / input3.dims()[0] / input3.dims()[1];
  int stride2 = input3.dims()[3];

  /// inputx1 (4,10,2,2),
  /// inputx2 (4,20,2,2),
  /// inputx3 (4,30,2,2),
  /// inputx4 (4,40,2,2),
  /// axis = 1
  /// output (4,100,2,2)
  int input_index =
      input_n * stride0 + input_c * stride1 + input_h * stride2 + input_w;
  int output_index = input_n * 100 * 2 * 2 +
                     (input_c + input1.dims()[1] + input2.dims()[1]) * 2 * 2 +
                     input_h * 2 + input_w;

  DLOG << " input3 [1, 2,0,1] = " << input3_data[input_index];
  DLOG << " output [1,32,0,1] = " << output0_data[output_index];
  return 0;
}
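The test above spot-checks a single element of input3's slice. A fuller check could walk every element with the same stride arithmetic; a sketch (CheckInput3Slice is a hypothetical helper, with channel_offset = input1.dims()[1] + input2.dims()[1] = 30 and out_channels = 100):

// Hypothetical exhaustive check: every element of the third input must
// reappear in the concat output at the same (n, h, w) with its channel
// shifted by the channels of the inputs concatenated before it.
bool CheckInput3Slice(const float *in3, const float *out, int n, int c_in,
                      int h, int w, int channel_offset, int out_channels) {
  for (int nn = 0; nn < n; ++nn)
    for (int cc = 0; cc < c_in; ++cc)
      for (int hh = 0; hh < h; ++hh)
        for (int ww = 0; ww < w; ++ww) {
          int in_idx = ((nn * c_in + cc) * h + hh) * w + ww;
          int out_idx =
              ((nn * out_channels + channel_offset + cc) * h + hh) * w + ww;
          if (in3[in_idx] != out[out_idx]) return false;
        }
  return true;
}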