diff --git a/src/fpga/V2/api.cpp b/src/fpga/V2/api.cpp
index 76d4b925e57f44c7d595995871537997bd962f69..324ee4f5381a20a9a34000045b130d61f71ec116 100644
--- a/src/fpga/V2/api.cpp
+++ b/src/fpga/V2/api.cpp
@@ -15,9 +15,9 @@ limitations under the License. */
 #include "fpga/V2/api.h"
 #include
 #include "fpga/V2/bias_scale.h"
+#include "fpga/V2/config.h"
 #include "fpga/V2/filter.h"
 #include "fpga/V2/image.h"
-#include "fpga/V2/config.h"
 
 namespace paddle_mobile {
 namespace fpga {
diff --git a/src/fpga/V2/api.h b/src/fpga/V2/api.h
index 8f612ec57fb8c425a08dd095d95cdb3a99564658..aac97bec225a4940f710172c115e06452469d289 100644
--- a/src/fpga/V2/api.h
+++ b/src/fpga/V2/api.h
@@ -22,7 +22,6 @@ limitations under the License. */
 #include "fpga/V2/driver/pe.h"
 #include "framework/tensor.h"
 
-
 namespace paddle_mobile {
 namespace fpga {
 
@@ -36,7 +35,6 @@ enum LayoutType {
   LAYOUT_HWC = 0,
 };
 
-
 struct KernelArgs {
   uint32_t width;
   uint32_t height;
@@ -122,8 +120,6 @@ struct BypassArgs {
   struct ImageOutputArgs output;
 };
 
-
-
 int open_device();
 int close_device();
 void* fpga_malloc(size_t size);
diff --git a/src/fpga/V2/config.h b/src/fpga/V2/config.h
index 56429b2848487fe5b29a23c190fa820499b99871..27187c7b854c84d501949db41fe89f9dca1d2bf1 100644
--- a/src/fpga/V2/config.h
+++ b/src/fpga/V2/config.h
@@ -14,5 +14,5 @@ limitations under the License. */
 
 #pragma once
 
-//#define PADDLE_MOBILE_ZU5
+#define PADDLE_MOBILE_ZU5
 #define FPGA_PRINT_MODE
diff --git a/src/fpga/V2/driver/pe.cpp b/src/fpga/V2/driver/pe.cpp
index 63a0a9ff9423f55e629418d71226f2fba40640ec..52cde04601bc5e002ce2d8e15b3bdb1ce64b340a 100644
--- a/src/fpga/V2/driver/pe.cpp
+++ b/src/fpga/V2/driver/pe.cpp
@@ -13,13 +13,34 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "fpga/V2/driver/pe.h"
+#include "fpga/V2/config.h"
 #include "fpga/V2/driver/driver.h"
+#include "fpga/V2/filter.h"
 #include "fpga/V2/image.h"
-#include "fpga/V2/config.h"
 
 namespace paddle_mobile {
 namespace fpga {
-#define MUL8(x) x*8
+#define MUL8(x) ((x) * 8)
+#define BYPASS_DONE 1
+
+// Reads the four 64-bit registers at MUL8(49)..MUL8(52) with reg_readq(),
+// treats each one as four 16-bit lanes, keeps the bits selected by the 0x7f
+// mask in every lane and returns the largest of the 16 values, converted
+// with fp16_2_fp32() and divided by 127.
+float Findfp16Max() {
+  uint16_t max_val = 0;
+  for (int reg = 49; reg <= 52; reg++) {
+    uint64_t packed = reg_readq(MUL8(reg));
+    for (int lane = 0; lane < 4; lane++) {
+      uint16_t lane_val =
+          (uint16_t)(0x0000007f & (packed >> (16 * lane)));  // NOLINT
+      if (max_val < lane_val) {
+        max_val = lane_val;
+      }
+    }
+  }
+  return fp16_2_fp32(max_val) / 127.0f;
+}
 
 int ComputeFpgaConv(const struct SplitConvArgs &args) {
   ComputeBasicConv(args.conv_args[0]);
@@ -129,26 +150,55 @@ int PerformBypass(const struct BypassArgs &args) {
 #ifndef PADDLE_MOBILE_ZU5
   return 0;
 #endif
-  uint64_t bp_enable = 0x8800000000000000;
-  uint64_t *ifm_src_paddr = (uint64_t *)vaddr_to_paddr(args.image.address); // NOLINT
-  uint64_t *ifm_dst_paddr = (uint64_t *)vaddr_to_paddr(args.output.address); // NOLINT
-  uint64_t length = (args.image.channels)*(args.image.width)*(args.image.height);
-  bp_enable += length;
+  uint64_t ifm_src_paddr = vaddr_to_paddr(args.image.address);
+  uint64_t ifm_dst_paddr = vaddr_to_paddr(args.output.address);
+  uint64_t bp_enable;
+  int64_t length;
+  uint64_t pixels;
+
+  // pick the command word and byte length for the requested conversion
+  if ((args.input_data_type) && (!args.output_data_type)) {
+    // fp32->fp16
+    pixels = (args.image.channels) * (args.image.width) * (args.image.height);
+    length = pixels * sizeof(float);
+    bp_enable = 0x8800000000000000 + length;
+  } else if ((!args.input_data_type) && (args.output_data_type)) {
+    // fp16->fp32
+    pixels = filter::calc_aligned_channel((args.image.channels)) *
+             (args.image.width) * (args.image.height);
+    length = pixels * sizeof(short);
+    length = align_to_x((int)length, 64);  // NOLINT
+    bp_enable = 0x8a00000000000000 + length;
+  } else if ((!args.input_data_type) && (!args.output_data_type)) {
+    // fp16->fp16 findmax
+    pixels = (args.image.channels) * (args.image.width) * (args.image.height);
+    length = pixels * sizeof(short);
+    bp_enable = 0x8900000000000000 + length;
+  } else {
+    return -1;
+  }
 
-  reg_writeq(*ifm_src_paddr, MUL8(27));
-  reg_writeq(*ifm_dst_paddr, MUL8(28));
+  // start bypass
+  reg_writeq(ifm_src_paddr, MUL8(27));
+  reg_writeq(ifm_dst_paddr, MUL8(28));
   reg_writeq(0, MUL8(0));
   reg_writeq(bp_enable, MUL8(0));
 
-  //poll
+  // poll
   int ret = -1;
-  ret = fpga_regpoll(MUL8(48),1, 0xffffffff);
-  if(ret != -1) {
+  ret = fpga_regpoll(MUL8(48), BYPASS_DONE, 0xffffffff);
+  if (ret != -1) {
+    // clear "irq"
     reg_readq(MUL8(63));
   }
-
+  // get max value; skipped when the poll failed, so the caller's scale
+  // buffer is not overwritten from a transfer that did not report done
+  if ((ret != -1) && (!args.input_data_type) && (!args.output_data_type)) {
+    float scale = Findfp16Max();
+    args.output.scale_address[0] = (float)(1.0 / scale);  // NOLINT
+    args.output.scale_address[1] = scale;
+  }
   return ret;
-
 }
 
 int ComputeFPGAConcat(const struct ConcatArgs &args) {