change format

34a7a0e0 · zhangyang · 5cfea26f · 34a7a0e0 · 34a7a0e0 · 34a7a0e0
Showing with 81 addition and 20 deletion

src/fpga/V2/api.cpp src/fpga/V2/api.cpp +1 -1

src/fpga/V2/api.h src/fpga/V2/api.h +0 -4

src/fpga/V2/config.h src/fpga/V2/config.h +1 -1

src/fpga/V2/driver/pe.cpp src/fpga/V2/driver/pe.cpp +79 -14

未找到文件。
--- a/src/fpga/V2/api.cpp
+++ b/src/fpga/V2/api.cpp
@@ -15,9 +15,9 @@ limitations under the License. */
 #include "fpga/V2/api.h"
 #include <algorithm>
 #include "fpga/V2/bias_scale.h"
+#include "fpga/V2/config.h"
 #include "fpga/V2/filter.h"
 #include "fpga/V2/image.h"
-#include "fpga/V2/config.h"
 namespace paddle_mobile {
 namespace fpga {

--- a/src/fpga/V2/api.h
+++ b/src/fpga/V2/api.h
@@ -22,7 +22,6 @@ limitations under the License. */
 #include "fpga/V2/driver/pe.h"
 #include "framework/tensor.h"
 namespace paddle_mobile {
 namespace fpga {
@@ -36,7 +35,6 @@ enum LayoutType {
  LAYOUT_HWC = 0,
 };
 struct KernelArgs {
  uint32_t width;
  uint32_t height;
@@ -122,8 +120,6 @@ struct BypassArgs {
  struct ImageOutputArgs output;
 };
 int open_device();
 int close_device();
 void* fpga_malloc(size_t size);

--- a/src/fpga/V2/config.h
+++ b/src/fpga/V2/config.h
@@ -14,5 +14,5 @@ limitations under the License. */
 #pragma once
-//#define PADDLE_MOBILE_ZU5
+#define PADDLE_MOBILE_ZU5
 #define FPGA_PRINT_MODE
--- a/src/fpga/V2/driver/pe.cpp
+++ b/src/fpga/V2/driver/pe.cpp
@@ -13,13 +13,49 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "fpga/V2/driver/pe.h"
+#include "fpga/V2/config.h"
 #include "fpga/V2/driver/driver.h"
+#include "fpga/V2/filter.h"
 #include "fpga/V2/image.h"
-#include "fpga/V2/config.h"
 namespace paddle_mobile {
 namespace fpga {
-#define MUL8(x) x*8
+#define MUL8(x) (x * 8)
+#define BYPASS_DONE 1
+float Findfp16Max() {
+  uint16_t abs_vals[16];
+  uint64_t max_fp16;
+  max_fp16 = reg_readq(MUL8(49));
+  abs_vals[0] = (uint16_t)(0x0000007f & (max_fp16));        // NOLINT
+  abs_vals[1] = (uint16_t)(0x0000007f & (max_fp16 >> 16));  // NOLINT
+  abs_vals[2] = (uint16_t)(0x0000007f & (max_fp16 >> 32));  // NOLINT
+  abs_vals[3] = (uint16_t)(0x0000007f & (max_fp16 >> 48));  // NOLINT
+  max_fp16 = reg_readq(MUL8(50));
+  abs_vals[4] = (uint16_t)(0x0000007f & (max_fp16));        // NOLINT
+  abs_vals[5] = (uint16_t)(0x0000007f & (max_fp16 >> 16));  // NOLINT
+  abs_vals[6] = (uint16_t)(0x0000007f & (max_fp16 >> 32));  // NOLINT
+  abs_vals[7] = (uint16_t)(0x0000007f & (max_fp16 >> 48));  // NOLINT
+  max_fp16 = reg_readq(MUL8(51));
+  abs_vals[8] = (uint16_t)(0x0000007f & (max_fp16));         // NOLINT
+  abs_vals[9] = (uint16_t)(0x0000007f & (max_fp16 >> 16));   // NOLINT
+  abs_vals[10] = (uint16_t)(0x0000007f & (max_fp16 >> 32));  // NOLINT
+  abs_vals[11] = (uint16_t)(0x0000007f & (max_fp16 >> 48));  // NOLINT
+  max_fp16 = reg_readq(MUL8(52));
+  abs_vals[12] = (uint16_t)(0x0000007f & (max_fp16));
+  abs_vals[13] = (uint16_t)(0x0000007f & (max_fp16 >> 16));  // NOLINT
+  abs_vals[14] = (uint16_t)(0x0000007f & (max_fp16 >> 32));  // NOLINT
+  abs_vals[15] = (uint16_t)(0x0000007f & (max_fp16 >> 48));  // NOLINT
+  uint16_t tmp = 0;
+  for (int i = 0; i < 16; i++) {
+    if (tmp < abs_vals[i]) {
+      tmp = abs_vals[i];
+    }
+  }
+  return fp16_2_fp32(tmp) / 127.0f;
+}
 int ComputeFpgaConv(const struct SplitConvArgs &args) {
  ComputeBasicConv(args.conv_args[0]);
@@ -129,26 +165,55 @@ int PerformBypass(const struct BypassArgs &args) {
 #ifndef PADDLE_MOBILE_ZU5
  return 0;
 #endif
-  uint64_t bp_enable = 0x8800000000000000;
-  uint64_t *ifm_src_paddr = (uint64_t *)vaddr_to_paddr(args.image.address); // NOLINT
-  uint64_t *ifm_dst_paddr = (uint64_t *)vaddr_to_paddr(args.output.address); // NOLINT
-  uint64_t length = (args.image.channels)*(args.image.width)*(args.image.height);
-  bp_enable += length;
+  uint64_t ifm_src_paddr = vaddr_to_paddr(args.image.address);
+  uint64_t ifm_dst_paddr = vaddr_to_paddr(args.output.address);
+  uint64_t bp_enable;
+  int64_t length;
+  uint64_t pixels;
+  // fp32->fp16
+  if ((args.input_data_type) && (!args.output_data_type)) {
+    pixels = (args.image.channels) * (args.image.width) * (args.image.height);
+    length = pixels * sizeof(float);
+    bp_enable = 0x8800000000000000 + length;
+  }
+  // fp16->fp32
+  else if ((!args.input_data_type) && (args.output_data_type)) {
+    pixels = filter::calc_aligned_channel((args.image.channels)) *
+             (args.image.width) * (args.image.height);
+    length = pixels * sizeof(short);
+    length = align_to_x((int)length, 64);  // NOLINT
+    bp_enable = 0x8a00000000000000 + length;
+  }
+  // fp16->fp16 findmax
+  else if ((!args.input_data_type) && (!args.output_data_type)) {
+    pixels = (args.image.channels) * (args.image.width) * (args.image.height);
+    length = pixels * sizeof(short);
+    bp_enable = 0x8900000000000000 + length;
+  } else {
+    return -1;
+  }
-  reg_writeq(*ifm_src_paddr, MUL8(27));
+  // start bypass
-  reg_writeq(*ifm_dst_paddr, MUL8(28));
+  reg_writeq(ifm_src_paddr, MUL8(27));
+  reg_writeq(ifm_dst_paddr, MUL8(28));
  reg_writeq(0, MUL8(0));
  reg_writeq(bp_enable, MUL8(0));
-  //poll
+  // poll
  int ret = -1;
-  ret = fpga_regpoll(MUL8(48),1, 0xffffffff);
+  ret = fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff);
-  if(ret != -1) {
+  if (ret != -1) {
+    // clear "irq"
    reg_readq(MUL8(63));
  }
+  // get max value
+  if ((!args.input_data_type) && (!args.output_data_type)) {
+    float scale = Findfp16Max();
+    args.output.scale_address[0] = (float)(1.0 / scale);  // NOLINT
+    args.output.scale_address[1] = scale;
+  }
  return ret;
 }
 int ComputeFPGAConcat(const struct ConcatArgs &args) {